* [PATCH 0/9] sched: balance callbacks
@ 2015-06-03 13:29 Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 1/9] sched: Replace post_schedule with a balance callback list Peter Zijlstra
                   ` (8 more replies)
  0 siblings, 9 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

Mike stumbled over a cute bug in the RT/DL balancing ops.

The exact scenario is __sched_setscheduler() changing a (runnable) task from
FIFO to OTHER. In switched_from_rt(), where we do pull_rt_task(), we
temporarily drop rq->lock. This gap allows regular cfs load-balancing to step
in and migrate our task.

However, check_class_changed() will happily continue with switched_to_fair()
which assumes our task is still on the old rq and makes the kernel go boom.
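
In call-chain form, the broken path looks roughly like this (a sketch, not an
exact trace):

  __sched_setscheduler(p)                   /* FIFO -> OTHER */
    check_class_changed(rq, p, ...)
      switched_from_rt(rq, p)
        pull_rt_task(rq)
          double_lock_balance(rq, src_rq)   /* may drop rq->lock */
            /* cfs load-balancing steps in and migrates p to another rq */
      switched_to_fair(rq, p)               /* still assumes p is on rq: boom */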

Instead of trying to patch this up and make things complicated, simply
disallow these methods from dropping rq->lock, extend the current
post_schedule stuff into a balancing callback list, and use that.

This survives Mike's testcase.

Changes since -v1:
 - make SMP=n build,
 - cured switched_from_dl()'s cancel_dl_timer().

no real tests on the new parts other than booting.



* [PATCH 1/9] sched: Replace post_schedule with a balance callback list
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 2/9] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-1.patch --]
[-- Type: text/plain, Size: 7096 bytes --]

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and cross
sched_class.
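
For illustration, a minimal sketch of using this outside of schedule(), which
is what later patches in this series do (some_head and some_func stand in for
a per-cpu callback_head and its handler; they are not real kernel symbols):

	preempt_disable();		/* keep rq from going away on us */
	raw_spin_lock(&rq->lock);
	/* ... work that wants deferred balancing ... */
	queue_balance_callback(rq, &per_cpu(some_head, rq->cpu), some_func);
	raw_spin_unlock(&rq->lock);

	balance_callback(rq);		/* takes rq->lock itself, runs the queued work */
	preempt_enable();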

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++------------
 kernel/sched/deadline.c |   21 +++++++++++----------
 kernel/sched/rt.c       |   25 +++++++++++--------------
 kernel/sched/sched.h    |   19 +++++++++++++++++--
 4 files changed, 63 insertions(+), 38 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(str
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2822,7 +2834,7 @@ static void __sched __schedule(void)
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
-	post_schedule(rq);
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -7216,7 +7228,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	if (!has_pushable_dl_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -296,7 +303,7 @@ static inline int pull_dl_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(st
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1544,11 +1551,6 @@ static int pull_dl_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
-}
-
 /*
  * Since the task is not running and a reschedule is not going to happen
  * anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -2042,11 +2045,6 @@ static int pull_rt_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
-}
-
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -767,6 +768,21 @@ extern int migrate_swap(struct task_stru
 
 #ifdef CONFIG_SMP
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);
+
+	if (unlikely(head->next))
+		return;
+
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -1192,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 




* [PATCH 2/9] sched: Use replace normalize_task() with __sched_setscheduler()
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 1/9] sched: Replace post_schedule with a balance callback list Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 3/9] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-6.patch --]
[-- Type: text/plain, Size: 3953 bytes --]

Reduce duplicate logic; normalize_task() is a simplified version of
__sched_setscheduler(). Parametrize the difference and collapse.

This reduces the number of check_class_changed() sites.
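
For reference, the resulting signature and what the two flags mean (a sketch;
the parameter names are taken from the patch below):

	/*
	 * user: request comes from userspace, so perform the permission
	 *       and rlimit checks.
	 * pi:   take rt_mutex priority boosting into account and call
	 *       rt_mutex_adjust_pi() afterwards.
	 */
	static int __sched_setscheduler(struct task_struct *p,
					const struct sched_attr *attr,
					bool user, bool pi);

	/* normalize_rt_tasks() then simply does: */
	__sched_setscheduler(p, &attr, false, false);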

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   65 ++++++++++++++++++----------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3428,7 +3428,7 @@ static bool dl_param_changed(struct task
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3614,18 +3614,20 @@ static int __sched_setscheduler(struct t
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3636,7 +3638,7 @@ static int __sched_setscheduler(struct t
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3651,7 +3653,8 @@ static int __sched_setscheduler(struct t
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 
 	return 0;
 }
@@ -3672,7 +3675,7 @@ static int _sched_setscheduler(struct ta
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3693,7 +3696,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -7354,32 +7357,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7406,9 +7389,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }




* [PATCH 3/9] sched: Allow balance callbacks for check_class_changed()
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 1/9] sched: Replace post_schedule with a balance callback list Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 2/9] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 4/9] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-7.patch --]
[-- Type: text/plain, Size: 3024 bytes --]

In order to remove the dropping of rq->lock from the
switched_{to,from}()/prio_changed() sched_class methods, run the
balance callbacks after them.

We need to remove the dropping of rq->lock because it's buggy; suppose
we use sched_setattr()/sched_setscheduler() to change a running task
from FIFO to OTHER.

By the time we get to switched_from_rt() the task is already enqueued
on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
drops rq->lock, load-balancing can come in and move our task @p to
another rq.

The subsequent switched_to_fair() still assumes @p is on @rq and bad
things will happen.

By using balance callbacks we delay the load-balancing operations
{rt,dl}x{push,pull} until we've done all the important work and the
task is fully set up.

Furthermore, the balance callbacks do not know about @p, therefore
they cannot get confused like this.
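
The resulting call pattern looks roughly like this (a sketch of what the
hunks below do in rt_mutex_setprio() and __sched_setscheduler()):

	check_class_changed(rq, p, prev_class, oldprio);

	preempt_disable();		/* avoid rq from going away on us */
	task_rq_unlock(rq, p, &flags);

	if (pi)
		rt_mutex_adjust_pi(p);	/* fix up the PI chain first */

	balance_callback(rq);		/* now run the queued {push,pull} ops */
	preempt_enable();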

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_s
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1491,8 +1495,12 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * XXX can drop rq->lock; most likely ok.
+		 */
 		p->sched_class->task_woken(rq, p);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -3094,7 +3102,11 @@ void rt_mutex_setprio(struct task_struct
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3655,11 +3667,18 @@ static int __sched_setscheduler(struct t
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
+
 	return 0;
 }
 




* [PATCH 4/9] sched,rt: Remove return value from pull_rt_task()
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-06-03 13:29 ` [PATCH 3/9] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 5/9] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-2.patch --]
[-- Type: text/plain, Size: 2408 bytes --]

In order to be able to use pull_rt_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to rt balancing.

Too many reschedules is not a correctness issue, too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_gro
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -415,9 +415,8 @@ static inline bool need_pull_rt_task(str
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1955,14 +1954,15 @@ static void push_irq_work_func(struct ir
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1973,7 +1973,7 @@ static int pull_rt_task(struct rq *this_
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2026,7 +2026,7 @@ static int pull_rt_task(struct rq *this_
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2042,7 +2042,8 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2138,8 +2139,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)




* [PATCH 5/9] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-06-03 13:29 ` [PATCH 4/9] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 6/9] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-3.patch --]
[-- Type: text/plain, Size: 3009 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_rt() / prio_changed_rt() and use the balance
callback queue.

Again, err on the side of too many reschedules; too few is a hard bug
while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,16 +354,23 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 
 static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -2139,7 +2146,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	pull_rt_task(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2160,8 +2167,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2171,13 +2176,12 @@ static void switched_to_rt(struct rq *rq
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2198,14 +2202,13 @@ prio_changed_rt(struct rq *rq, struct ta
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */




* [PATCH 6/9] sched,dl: Remove return value from pull_dl_task()
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
                   ` (4 preceding siblings ...)
  2015-06-03 13:29 ` [PATCH 5/9] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 7/9] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-4.patch --]
[-- Type: text/plain, Size: 2135 bytes --]

In order to be able to use pull_dl_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to dl balancing.

Too many reschedules is not a correctness issue, too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -298,9 +298,8 @@ static inline bool need_pull_dl_task(str
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1049,7 +1048,7 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
+static void pull_dl_task(struct rq *this_rq);
 
 #endif /* CONFIG_SMP */
 
@@ -1480,15 +1479,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1543,7 +1543,7 @@ static int pull_dl_task(struct rq *this_
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1556,7 +1556,8 @@ static int pull_dl_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1712,8 +1713,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	pull_dl_task(rq);
 }
 
 /*




* [PATCH 7/9] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
                   ` (5 preceding siblings ...)
  2015-06-03 13:29 ` [PATCH 6/9] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
  2015-06-03 13:29 ` [PATCH 9/9] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-5.patch --]
[-- Type: text/plain, Size: 3223 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_dl() / prio_changed_dl() and use the balance
callback queue.

Again, err on the side of too many reschedules; too few is a hard bug
while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,16 +213,23 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
 static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
+	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -305,6 +312,10 @@ static inline void pull_dl_task(struct r
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -1048,8 +1059,6 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static void pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1713,7 +1722,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	pull_dl_task(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1722,21 +1731,16 @@ static void switched_from_dl(struct rq *
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1756,15 +1760,14 @@ static void prio_changed_dl(struct rq *r
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*




* [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
                   ` (6 preceding siblings ...)
  2015-06-03 13:29 ` [PATCH 7/9] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  2015-06-03 16:26   ` Kirill Tkhai
  2015-06-03 17:41   ` Thomas Gleixner
  2015-06-03 13:29 ` [PATCH 9/9] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
  8 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-hrtimer-base-running.patch --]
[-- Type: text/plain, Size: 7749 bytes --]

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/hrtimer.h |   35 ++++++++++++++---------------------
 kernel/time/hrtimer.c   |   48 ++++++++++++++++++++++--------------------------
 2 files changed, 36 insertions(+), 47 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,34 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
+ * 0x02		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in timer->base->running == timer.
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
+#define HRTIMER_STATE_MIGRATE	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -153,6 +144,7 @@ struct hrtimer_clock_base {
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
 	ktime_t			offset;
+	struct hrtimer		*running;
 } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
 
 enum  hrtimer_base_type {
@@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
  */
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->state != HRTIMER_STATE_INACTIVE;
+	return timer->state != HRTIMER_STATE_INACTIVE ||
+		timer->base->running == timer;
 }
 
 /*
@@ -419,7 +412,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -111,6 +111,13 @@ static inline int hrtimer_clockid_to_bas
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base.
+ */
+static struct hrtimer_clock_base migration_base;
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +126,8 @@ static inline int hrtimer_clockid_to_bas
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +137,7 @@ struct hrtimer_clock_base *lock_hrtimer_
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +201,8 @@ switch_hrtimer_base(struct hrtimer *time
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -840,11 +847,7 @@ static int enqueue_hrtimer(struct hrtime
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -894,7 +897,6 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
 		int reprogram;
 
 		/*
@@ -908,13 +910,8 @@ remove_hrtimer(struct hrtimer *timer, st
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
-		__remove_hrtimer(timer, base, state, reprogram);
+
+		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram);
 		return 1;
 	}
 	return 0;
@@ -1124,7 +1121,8 @@ static void __run_hrtimer(struct hrtimer
 	WARN_ON(!irqs_disabled());
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	base->running = timer;
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1140,7 +1138,7 @@ static void __run_hrtimer(struct hrtimer
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1152,9 +1150,8 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
-
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(base->running != timer);
+	base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
@@ -1523,11 +1520,10 @@ static void migrate_hrtimer_list(struct
 		 * hrtimer_interrupt after we migrated everything to
 		 * sort out already expired timers and reprogram the
 		 * event device.
+		 *
+		 * Sets timer->state = HRTIMER_STATE_ENQUEUED.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 




* [PATCH 9/9] sched,dl: Fix sched class hopping CBS hole
  2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
                   ` (7 preceding siblings ...)
  2015-06-03 13:29 ` [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
@ 2015-06-03 13:29 ` Peter Zijlstra
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 13:29 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz, Juri Lelli, Luca Abeni

[-- Attachment #1: peterz-sched-dl-cbs-bandwidth-timer.patch --]
[-- Type: text/plain, Size: 9066 bytes --]

We still have a few pending issues with the deadline code, one of which
is that switching between scheduling classes can 'leak' CBS state.

Close the hole by retaining the current CBS state when leaving
SCHED_DEADLINE and unconditionally programming the deadline timer.
The timer will then reset the CBS state if the task is still
!SCHED_DEADLINE by the time it hits.

If the task left SCHED_DEADLINE it will not call task_dead_dl() and
we'll not cancel the hrtimer, leaving us with a pending timer in freed
memory. Avoid this by giving the timer a task reference; this avoids
littering the task exit path for this rather uncommon case.
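
The resulting get/put pairing (a sketch; the hunks below have the details):

	/* start_dl_timer(), called with task_rq(p)->lock held */
	if (!hrtimer_is_queued(timer)) {
		get_task_struct(p);
		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
	}

	/* dl_task_timer(), at the very end */
	task_rq_unlock(rq, p, &flags);
	put_task_struct(p);	/* may free p, and with it this hrtimer */
	return HRTIMER_NORESTART;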

In order to do this, I had to move dl_task_offline_migration() below
the replenishment, such that the task_rq()->lock fully covers that.
While doing this, I noticed that it (was) buggy in assuming a task is
enqueued and/or we need to enqueue the task now. Fixing this means
select_task_rq_dl() might encounter an offline rq -- look into that.

Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Wanpeng Li <wanpeng.li@linux.intel.com>
Cc: Luca Abeni <luca.abeni@unitn.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |  152 +++++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 66 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -234,7 +234,7 @@ static inline void queue_pull_task(struc
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
 	bool fallback = false;
@@ -268,14 +268,19 @@ static void dl_task_offline_migration(st
 		double_lock_balance(rq, later_rq);
 	}
 
+	/*
+	 * By now the task is replenished and enqueued; migrate it.
+	 */
 	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, ENQUEUE_REPLENISH);
+	activate_task(later_rq, p, 0);
 
 	if (!fallback)
 		resched_curr(later_rq);
 
-	double_unlock_balance(rq, later_rq);
+	double_unlock_balance(later_rq, rq);
+
+	return later_rq;
 }
 
 #else
@@ -515,22 +520,23 @@ static void update_dl_entity(struct sche
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->dl_timer;
+	struct rq *rq = task_rq(p);
 	ktime_t now, act;
 	s64 delta;
 
-	if (boosted)
-		return 0;
+	lockdep_assert_held(&rq->lock);
+
 	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
 	act = ns_to_ktime(dl_se->deadline);
-	now = hrtimer_cb_get_time(&dl_se->dl_timer);
+	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
 
@@ -542,7 +548,19 @@ static int start_dl_timer(struct sched_d
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
+	/*
+	 * !enqueued will guarantee another callback; even if one is already in
+	 * progress. This ensures a balanced {get,put}_task_struct().
+	 *
+	 * The race against __run_timer() clearing the enqueued state is
+	 * harmless because we're holding task_rq()->lock, therefore the timer
+	 * expiring after we've done the check will wait on its task_rq_lock()
+	 * and observe our state.
+	 */
+	if (!hrtimer_is_queued(timer)) {
+		get_task_struct(p);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+	}
 
 	return 1;
 }
@@ -572,35 +590,40 @@ static enum hrtimer_restart dl_task_time
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_from_dl()).
+	 */
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * This is possible if switched_from_dl() raced against a running
+	 * callback that took the above !dl_task() path and we've since then
+	 * switched back into SCHED_DEADLINE.
 	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * There's nothing to do except drop our task reference.
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (dl_se->dl_new)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path, it's not throttled.
+	 */
+	if (dl_se->dl_boosted)
+		goto unlock;
 
-#ifdef CONFIG_SMP
 	/*
-	 * If we find that the rq the task was on is no longer
-	 * available, we need to select a new rq.
+	 * Spurious timer due to start_dl_timer() race; or we already received
+	 * a replenishment from rt_mutex_setprio().
 	 */
-	if (unlikely(!rq->online)) {
-		dl_task_offline_migration(rq, p);
+	if (!dl_se->dl_throttled)
 		goto unlock;
-	}
-#endif
+
+	sched_clock_tick();
+	update_rq_clock(rq);
 
 	/*
 	 * If the throttle happened during sched-out; like:
@@ -626,17 +649,38 @@ static enum hrtimer_restart dl_task_time
 		check_preempt_curr_dl(rq, p, 0);
 	else
 		resched_curr(rq);
+
 #ifdef CONFIG_SMP
 	/*
-	 * Queueing this task back might have overloaded rq,
-	 * check if we need to kick someone away.
+	 * Perform balancing operations here; after the replenishments.  We
+	 * cannot drop rq->lock before this, otherwise the assertion in
+	 * start_dl_timer() about not missing updates is not true.
+	 *
+	 * If we find that the rq the task was on is no longer available, we
+	 * need to select a new rq.
+	 *
+	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
+	 */
+	if (unlikely(!rq->online))
+		rq = dl_task_offline_migration(rq, p);
+
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
 	 */
 	if (has_pushable_dl_tasks(rq))
 		push_dl_task(rq);
 #endif
+
 unlock:
 	task_rq_unlock(rq, p, &flags);
 
+	/*
+	 * This can free the task_struct, including this hrtimer, do not touch
+	 * anything related to that after this.
+	 */
+	put_task_struct(p);
+
 	return HRTIMER_NORESTART;
 }
 
@@ -696,7 +740,7 @@ static void update_curr_dl(struct rq *rq
 	if (dl_runtime_exceeded(rq, dl_se)) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -1178,7 +1222,6 @@ static void task_fork_dl(struct task_str
 
 static void task_dead_dl(struct task_struct *p)
 {
-	struct hrtimer *timer = &p->dl.dl_timer;
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 	/*
@@ -1188,8 +1231,6 @@ static void task_dead_dl(struct task_str
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
-
-	hrtimer_cancel(timer);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1674,37 +1715,16 @@ void init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 */
+	if (!start_dl_timer(p))
+		__dl_clear_params(p);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,



* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 13:29 ` [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
@ 2015-06-03 16:26   ` Kirill Tkhai
  2015-06-03 21:13     ` Peter Zijlstra
  2015-06-03 17:41   ` Thomas Gleixner
  1 sibling, 1 reply; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-03 16:26 UTC (permalink / raw)
  To: Peter Zijlstra, umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel

[sorry for the broken formatting]

03.06.2015, 16:55, "Peter Zijlstra" <peterz@infradead.org>:
> Currently an hrtimer callback function cannot free its own timer
> because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
> after it. Freeing the timer would result in a clear use-after-free.
>
> Solve this by using a scheme similar to regular timers; track the
> current running timer in hrtimer_clock_base::running.
>
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/hrtimer.h |   35 ++++++++++++++---------------------
>  kernel/time/hrtimer.c   |   48 ++++++++++++++++++++++--------------------------
>  2 files changed, 36 insertions(+), 47 deletions(-)
>
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -53,34 +53,25 @@ enum hrtimer_restart {
>   *
>   * 0x00 inactive
>   * 0x01 enqueued into rbtree
> - * 0x02 callback function running
> - * 0x04 timer is migrated to another cpu
> + * 0x02 timer is migrated to another cpu
>   *
> - * Special cases:
> - * 0x03 callback function running and enqueued
> - * (was requeued on another CPU)
> - * 0x05 timer was migrated on CPU hotunplug
> + * The callback state is not part of the timer->state because clearing it would
> + * mean touching the timer after the callback, this makes it impossible to free
> + * the timer from the callback function.
>   *
> - * The "callback function running and enqueued" status is only possible on
> - * SMP. It happens for example when a posix timer expired and the callback
> + * Therefore we track the callback state in timer->base->running == timer.
> + *
> + * On SMP it is possible to have a "callback function running and enqueued"
> + * status. It happens for example when a posix timer expired and the callback
>   * queued a signal. Between dropping the lock which protects the posix timer
>   * and reacquiring the base lock of the hrtimer, another CPU can deliver the
> - * signal and rearm the timer. We have to preserve the callback running state,
> - * as otherwise the timer could be removed before the softirq code finishes the
> - * the handling of the timer.
> - *
> - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
> - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
> - * also affects HRTIMER_STATE_MIGRATE where the preservation is not
> - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
> - * enqueued on the new cpu.
> + * signal and rearm the timer.
>   *
>   * All state transitions are protected by cpu_base->lock.
>   */
>  #define HRTIMER_STATE_INACTIVE 0x00
>  #define HRTIMER_STATE_ENQUEUED 0x01
> -#define HRTIMER_STATE_CALLBACK 0x02
> -#define HRTIMER_STATE_MIGRATE 0x04
> +#define HRTIMER_STATE_MIGRATE 0x02
>
>  /**
>   * struct hrtimer - the basic hrtimer structure
> @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
>          struct timerqueue_head active;
>          ktime_t (*get_time)(void);
>          ktime_t offset;
> + struct hrtimer *running;
>  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
>
>  enum  hrtimer_base_type {
> @@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
>   */
>  static inline int hrtimer_active(const struct hrtimer *timer)
>  {
> - return timer->state != HRTIMER_STATE_INACTIVE;
> + return timer->state != HRTIMER_STATE_INACTIVE ||
> + timer->base->running == timer;
>  }

This does not seem good, because the hrtimer_active() check stops being
atomic. So things like hrtimer_try_to_cancel() can race with the
callback of a self-rearming timer and return a false positive result.
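
To make the non-atomicity concrete, an interleaving sketch (one CPU runs a
self-rearming callback, the other evaluates hrtimer_active()):

	callback CPU                              observer CPU
	------------                              ------------
	base->running = timer;
	timer->state = HRTIMER_STATE_INACTIVE;
	fn(timer);                                reads timer->state  -> INACTIVE
	  enqueue_hrtimer(timer);  /* re-arms */
	base->running = NULL;
	                                          reads base->running -> NULL
	                                          hrtimer_active() returns false,
	                                          although the timer was active
	                                          the whole time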


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 13:29 ` [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
  2015-06-03 16:26   ` Kirill Tkhai
@ 2015-06-03 17:41   ` Thomas Gleixner
  2015-06-03 21:29     ` Peter Zijlstra
  1 sibling, 1 reply; 26+ messages in thread
From: Thomas Gleixner @ 2015-06-03 17:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, wanpeng.li, linux-kernel

On Wed, 3 Jun 2015, Peter Zijlstra wrote:
>  /**
>   * struct hrtimer - the basic hrtimer structure
> @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
>  	struct timerqueue_head	active;
>  	ktime_t			(*get_time)(void);
>  	ktime_t			offset;
> +	struct hrtimer		*running;

Aside from lacking a KernelDoc comment, it expands the struct size on
32bit from 32 bytes to 36 bytes, which undoes some of the recent cache
line optimizations I did. Mooo!

So we might think about storing the running timer pointer in cpu_base
instead on 32bit. That increases the footprint of the migration base
and adds the cost of an extra indirection, but it would keep the cache
lines tight for the hot paths.

Other than that, this looks pretty good.

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 16:26   ` Kirill Tkhai
@ 2015-06-03 21:13     ` Peter Zijlstra
  2015-06-04  9:07       ` Kirill Tkhai
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 21:13 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Wed, Jun 03, 2015 at 07:26:00PM +0300, Kirill Tkhai wrote:
> > @@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
> >   */
> >  static inline int hrtimer_active(const struct hrtimer *timer)
> >  {
> > - return timer->state != HRTIMER_STATE_INACTIVE;
> > + return timer->state != HRTIMER_STATE_INACTIVE ||
> > + timer->base->running == timer;
> >  }
> 
> It seems to be not good, because hrtimer_active() check stops
> to be atomic. So the things like hrtimer_try_to_cancel() race
> with a callback of self-rearming timer and may return a false
> positive result.

Hurm.. the below isn't really pretty either I suppose. The best I can
say about it is that it's not too expensive on x86.

I should probably go sleep..

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
  * A timer is active, when it is enqueued into the rbtree or the
  * callback function is running or it's in the state of being migrated
  * to another cpu.
+ *
+ * See __run_hrtimer().
  */
-static inline int hrtimer_active(const struct hrtimer *timer)
+static inline bool hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->state != HRTIMER_STATE_INACTIVE ||
-		timer->base->running == timer;
+	if (timer->state != HRTIMER_STATE_INACTIVE)
+		return true;
+
+	smp_rmb(); /* C matches A */
+
+	if (timer->base->running == timer)
+		return true;
+
+	smp_rmb(); /* D matches B */
+
+	if (timer->state != HRTIMER_STATE_INACTIVE)
+		return true;
+
+	return false;
 }
 
 /*
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1122,6 +1122,20 @@ static void __run_hrtimer(struct hrtimer
 
 	debug_deactivate(timer);
 	base->running = timer;
+
+	/*
+	 * Pairs with hrtimer_active().
+	 *
+	 *	[S] base->running = timer	[L] timer->state
+	 *	    WMB 			    RMB
+	 *	[S] timer->state = INACTIVE	[L] base->running
+	 *
+	 * BUG_ON(base->running != timer && timer->state != INACTIVE)
+	 *
+	 * If we observe INACTIVE we must observe base->running == timer.
+	 */
+	smp_wmb(); /* A matches C */
+
 	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
@@ -1150,6 +1164,20 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
+	/*
+	 * Pairs with hrtimer_active().
+	 *
+	 *	[S] timer->state = ENQUEUED	[L] base->running
+	 *	    WMB 			    RMB
+	 *	[S] base->running = NULL	[L] timer->state
+	 *
+	 * BUG_ON(base->running == NULL && timer->state == INACTIVE)
+	 *
+	 * If we observe base->running == NULL, we must observe any preceding
+	 * enqueue.
+	 */
+	smp_wmb(); /* B matches D */
+
 	WARN_ON_ONCE(base->running != timer);
 	base->running = NULL;
 }

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 17:41   ` Thomas Gleixner
@ 2015-06-03 21:29     ` Peter Zijlstra
  2015-06-04  5:59       ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 21:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, wanpeng.li, linux-kernel

On Wed, Jun 03, 2015 at 07:41:43PM +0200, Thomas Gleixner wrote:
> On Wed, 3 Jun 2015, Peter Zijlstra wrote:
> >  /**
> >   * struct hrtimer - the basic hrtimer structure
> > @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
> >  	struct timerqueue_head	active;
> >  	ktime_t			(*get_time)(void);
> >  	ktime_t			offset;
> > +	struct hrtimer		*running;
> 
> Aside of lacking a KernelDoc comment, it expands the struct size on
> 32bit from 32 bytes to 36 bytes which undoes some of the recent cache
> line optimizations I did. Mooo!
> 
> So we might think about storing the running timer pointer in cpu_base
> instead for 32bit, which increases the foot print of the migration
> base and the extra cost for the additional indirection, but it would
> keep cache line tight for the hot pathes.

A wee something like this then?

---
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -123,8 +123,10 @@ struct hrtimer_sleeper {
 
 #ifdef CONFIG_64BIT
 # define HRTIMER_CLOCK_BASE_ALIGN	64
+# define __timer_base_running(timer)	timer->base->running
 #else
 # define HRTIMER_CLOCK_BASE_ALIGN	32
+# define __timer_base_running(timer)	timer->base->cpu_base->running
 #endif
 
 /**
@@ -136,6 +138,7 @@ struct hrtimer_sleeper {
  * @active:		red black tree root node for the active timers
  * @get_time:		function to retrieve the current time of the clock
  * @offset:		offset of this clock to the monotonic base
+ * @running:		pointer to the currently running hrtimer
  */
 struct hrtimer_clock_base {
 	struct hrtimer_cpu_base	*cpu_base;
@@ -144,7 +147,9 @@ struct hrtimer_clock_base {
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
 	ktime_t			offset;
+#ifdef CONFIG_64BIT
 	struct hrtimer		*running;
+#endif
 } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
 
 enum  hrtimer_base_type {
@@ -162,6 +167,7 @@ enum  hrtimer_base_type {
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
+ * @running:		pointer to the currently running hrtimer
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @next_timer:		Pointer to the first expiring timer
@@ -183,6 +189,9 @@ struct hrtimer_cpu_base {
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
+#ifndef CONFIG_64BIT
+	struct hrtimer			*running;
+#endif
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			in_hrtirq	: 1,
 					hres_active	: 1,
@@ -401,7 +410,7 @@ static inline bool hrtimer_active(const
 
 	smp_rmb(); /* C matches A */
 
-	if (timer->base->running == timer)
+	if (__timer_base_running(timer) == timer)
 		return true;
 
 	smp_rmb(); /* D matches B */
@@ -426,7 +435,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->base->running == timer;
+	return __timer_base_running(timer) == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -110,12 +110,20 @@ static inline int hrtimer_clockid_to_bas
  */
 #ifdef CONFIG_SMP
 
+#ifndef CONFIG_64BIT
+static struct hrtimer_cpu_base migration_cpu_base;
+
+#define MIGRATION_BASE_INIT { .cpu_base = &migration_cpu_base, }
+#else
+#define MIGRATION_BASE_INIT {}
+#endif
+
 /*
  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
  * such that hrtimer_callback_running() can unconditionally dereference
  * timer->base.
  */
-static struct hrtimer_clock_base migration_base;
+static struct hrtimer_clock_base migration_base = MIGRATION_BASE_INIT;
 
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
@@ -1121,7 +1129,7 @@ static void __run_hrtimer(struct hrtimer
 	WARN_ON(!irqs_disabled());
 
 	debug_deactivate(timer);
-	base->running = timer;
+	__timer_base_running(timer) = timer;
 
 	/*
 	 * Pairs with hrtimer_active().
@@ -1178,8 +1186,8 @@ static void __run_hrtimer(struct hrtimer
 	 */
 	smp_wmb(); /* B matches D */
 
-	WARN_ON_ONCE(base->running != timer);
-	base->running = NULL;
+	WARN_ON_ONCE(__timer_base_running(timer) != timer);
+	__timer_base_running(timer) = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 21:29     ` Peter Zijlstra
@ 2015-06-04  5:59       ` Ingo Molnar
  2015-06-04 10:07         ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2015-06-04  5:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Jun 03, 2015 at 07:41:43PM +0200, Thomas Gleixner wrote:
> > On Wed, 3 Jun 2015, Peter Zijlstra wrote:
> > >  /**
> > >   * struct hrtimer - the basic hrtimer structure
> > > @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
> > >  	struct timerqueue_head	active;
> > >  	ktime_t			(*get_time)(void);
> > >  	ktime_t			offset;
> > > +	struct hrtimer		*running;
> > 
> > Aside of lacking a KernelDoc comment, it expands the struct size on
> > 32bit from 32 bytes to 36 bytes which undoes some of the recent cache
> > line optimizations I did. Mooo!
> > 
> > So we might think about storing the running timer pointer in cpu_base
> > instead for 32bit, which increases the foot print of the migration
> > base and the extra cost for the additional indirection, but it would
> > keep cache line tight for the hot pathes.
> 
> A wee something like this then?
> 
> ---
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -123,8 +123,10 @@ struct hrtimer_sleeper {
>  
>  #ifdef CONFIG_64BIT
>  # define HRTIMER_CLOCK_BASE_ALIGN	64
> +# define __timer_base_running(timer)	timer->base->running
>  #else
>  # define HRTIMER_CLOCK_BASE_ALIGN	32
> +# define __timer_base_running(timer)	timer->base->cpu_base->running
>  #endif

Please put it into the cpu_base on 64-bit as well: the base pointer is available 
already on 64-bit so there should be no measurable performance difference, and 
readability is a primary concern with all this code.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-03 21:13     ` Peter Zijlstra
@ 2015-06-04  9:07       ` Kirill Tkhai
  2015-06-04 10:49         ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-04  9:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Wed, 03/06/2015 at 23:13 +0200, Peter Zijlstra wrote:
On Wed, Jun 03, 2015 at 07:26:00PM +0300, Kirill Tkhai wrote:
> > > @@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
> > >   */
> > >  static inline int hrtimer_active(const struct hrtimer *timer)
> > >  {
> > > - return timer->state != HRTIMER_STATE_INACTIVE;
> > > + return timer->state != HRTIMER_STATE_INACTIVE ||
> > > + timer->base->running == timer;
> > >  }
> > 
> > It seems to be not good, because hrtimer_active() check stops
> > to be atomic. So the things like hrtimer_try_to_cancel() race
> > with a callback of self-rearming timer and may return a false
> > positive result.
> 
> Hurm.. the below isn't really pretty either I suppose. The best I can
> say about it is that's its not too expensive on x86.
> 
> I should probably go sleep..
> 
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
>   * A timer is active, when it is enqueued into the rbtree or the
>   * callback function is running or it's in the state of being migrated
>   * to another cpu.
> + *
> + * See __run_hrtimer().
>   */
> -static inline int hrtimer_active(const struct hrtimer *timer)
> +static inline bool hrtimer_active(const struct hrtimer *timer)
>  {
> -	return timer->state != HRTIMER_STATE_INACTIVE ||
> -		timer->base->running == timer;
> +	if (timer->state != HRTIMER_STATE_INACTIVE)
> +		return true;
> +
> +	smp_rmb(); /* C matches A */
> +
> +	if (timer->base->running == timer)
> +		return true;
> +
> +	smp_rmb(); /* D matches B */
> +
> +	if (timer->state != HRTIMER_STATE_INACTIVE)
> +		return true;
> +
> +	return false;

This races with two sequential timer handlers. hrtimer_active()
is preemptible everywhere, and there is no guarantee that all three
"if" conditions check the same timer tick.
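
Spelling that interleaving out (a sketch in the [S]/[L] notation used
above; "expiry A" and "expiry B" are just labels for two consecutive
expirations of a self-rearming timer):

	timer CPU						hrtimer_active()

	expiry A:
	[S] base->running = timer
	[S] timer->state  = INACTIVE
							[L] timer->state  == INACTIVE
	callback A re-arms:
	[S] timer->state  = ENQUEUED
	[S] base->running = NULL
							[L] base->running != timer
	expiry B:
	[S] base->running = timer
	[S] timer->state  = INACTIVE
							[L] timer->state  == INACTIVE
							=> return false

although the timer was enqueued or had its callback running at every
point during the call.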

How about transformation of hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?

>  }
>  
>  /*
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -1122,6 +1122,20 @@ static void __run_hrtimer(struct hrtimer
>  
>  	debug_deactivate(timer);
>  	base->running = timer;
> +
> +	/*
> +	 * Pairs with hrtimer_active().
> +	 *
> +	 *	[S] base->running = timer	[L] timer->state
> +	 *	    WMB 			    RMB
> +	 *	[S] timer->state = INACTIVE	[L] base->running
> +	 *
> +	 * BUG_ON(base->running != timer && timer->state != INACTIVE)
> +	 *
> +	 * If we observe INACTIVE we must observe base->running == timer.
> +	 */
> +	smp_wmb(); /* A matches C */
> +
>  	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
>  	timer_stats_account_hrtimer(timer);
>  	fn = timer->function;
> @@ -1150,6 +1164,20 @@ static void __run_hrtimer(struct hrtimer
>  	    !(timer->state & HRTIMER_STATE_ENQUEUED))
>  		enqueue_hrtimer(timer, base);
>  
> +	/*
> +	 * Pairs with hrtimer_active().
> +	 *
> +	 *	[S] timer->state = ENQUEUED	[L] base->running
> +	 *	    WMB 			    RMB
> +	 *	[S] base->running = NULL	[L] timer->state
> +	 *
> +	 * BUG_ON(base->running == NULL && timer->state == INACTIVE)
> +	 *
> +	 * If we observe base->running == NULL, we must observe any preceding
> +	 * enqueue.
> +	 */
> +	smp_wmb(); /* B matches D */
> +
>  	WARN_ON_ONCE(base->running != timer);
>  	base->running = NULL;
>  }
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-04  5:59       ` Ingo Molnar
@ 2015-06-04 10:07         ` Peter Zijlstra
  2015-06-04 12:37           ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-04 10:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel

On Thu, Jun 04, 2015 at 07:59:30AM +0200, Ingo Molnar wrote:

> > --- a/include/linux/hrtimer.h
> > +++ b/include/linux/hrtimer.h
> > @@ -123,8 +123,10 @@ struct hrtimer_sleeper {
> >  
> >  #ifdef CONFIG_64BIT
> >  # define HRTIMER_CLOCK_BASE_ALIGN	64
> > +# define __timer_base_running(timer)	timer->base->running
> >  #else
> >  # define HRTIMER_CLOCK_BASE_ALIGN	32
> > +# define __timer_base_running(timer)	timer->base->cpu_base->running
> >  #endif
> 
> Please put it into the cpu_base on 64-bit as well: the base pointer is available 
> already on 64-bit so there should be no measurable performance difference, and 
> readability is a primary concern with all this code.

That's an extra pointer chase for no reason :-(

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-04  9:07       ` Kirill Tkhai
@ 2015-06-04 10:49         ` Peter Zijlstra
  2015-06-04 10:55           ` Peter Zijlstra
  2015-06-05  9:02           ` Kirill Tkhai
  0 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-04 10:49 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
> > --- a/include/linux/hrtimer.h
> > +++ b/include/linux/hrtimer.h
> > @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> >   * A timer is active, when it is enqueued into the rbtree or the
> >   * callback function is running or it's in the state of being migrated
> >   * to another cpu.
> > + *
> > + * See __run_hrtimer().
> >   */
> > -static inline int hrtimer_active(const struct hrtimer *timer)
> > +static inline bool hrtimer_active(const struct hrtimer *timer)
> >  {
> > -	return timer->state != HRTIMER_STATE_INACTIVE ||
> > -		timer->base->running == timer;
> > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > +		return true;
> > +
> > +	smp_rmb(); /* C matches A */
> > +
> > +	if (timer->base->running == timer)
> > +		return true;
> > +
> > +	smp_rmb(); /* D matches B */
> > +
> > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > +		return true;
> > +
> > +	return false;
> 
> This races with two sequential timer handlers. hrtimer_active()
> is preemptible everywhere, and no guarantees that all three "if"
> conditions check the same timer tick.

Indeed.

> How about transformation of hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?

Ingo will like that because it means we already need to touch cpu_base.

But I think there's a problem there on timer migration, the timer can
migrate between bases while we do the seq read loop and then you can get
false positives on the different seqcount numbers.

We could of course do something like the below, but hrtimer_is_active()
is turning into quite the monster.

Needs more comments at the very least, it's full of trickery.

---
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -59,7 +59,9 @@ enum hrtimer_restart {
  * mean touching the timer after the callback, this makes it impossible to free
  * the timer from the callback function.
  *
- * Therefore we track the callback state in timer->base->running == timer.
+ * Therefore we track the callback state in:
+ *
+ * 	timer->base->cpu_base->running == timer
  *
  * On SMP it is possible to have a "callback function running and enqueued"
  * status. It happens for example when a posix timer expired and the callback
@@ -144,7 +146,6 @@ struct hrtimer_clock_base {
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
 	ktime_t			offset;
-	struct hrtimer		*running;
 } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
 
 enum  hrtimer_base_type {
@@ -159,6 +160,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -180,6 +183,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
  */
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->state != HRTIMER_STATE_INACTIVE ||
-		timer->base->running == timer;
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+	bool active;
+
+	do {
+		active = false;
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seqcount_lockdep_reader_access(&cpu_base->seq);
+		seq = raw_read_seqcount(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			active = true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return active;
 }
 
 /*
@@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->base->running == timer;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
 /*
  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
  * such that hrtimer_callback_running() can unconditionally dereference
- * timer->base.
+ * timer->base->cpu_base
  */
-static struct hrtimer_clock_base migration_base;
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+};
+
+static struct hrtimer_clock_base migration_base = {
+	.cpu_base = &migration_cpu_base,
+};
 
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
@@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	base->running = timer;
+	cpu_base->running = timer;
+
+	/*
+	 * separate the ->running assignment from the ->state assignment
+	 */
+	write_seqcount_begin(&cpu_base->seq);
+
 	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
@@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(base->running != timer);
-	base->running = NULL;
+	/*
+	 * separate the ->running assignment from the ->state assignment
+	 */
+	write_seqcount_end(&cpu_base->seq);
+
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-04 10:49         ` Peter Zijlstra
@ 2015-06-04 10:55           ` Peter Zijlstra
  2015-06-04 10:58             ` Peter Zijlstra
  2015-06-05  9:02           ` Kirill Tkhai
  1 sibling, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-04 10:55 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Thu, Jun 04, 2015 at 12:49:02PM +0200, Peter Zijlstra wrote:
> Needs more comments at the very least, its fully of trickery.
> 
> @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
>  	enum hrtimer_restart (*fn)(struct hrtimer *);
>  	int restart;
>  
> -	WARN_ON(!irqs_disabled());
> +	lockdep_assert_held(&cpu_base->lock);
>  
>  	debug_deactivate(timer);
> -	base->running = timer;
> +	cpu_base->running = timer;
> +
> +	/*
> +	 * separate the ->running assignment from the ->state assignment
> +	 */
> +	write_seqcount_begin(&cpu_base->seq);

Maybe these need to be raw_write_seqcount_latch()..

> +
>  	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
>  	timer_stats_account_hrtimer(timer);
>  	fn = timer->function;
> @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
>  	    !(timer->state & HRTIMER_STATE_ENQUEUED))
>  		enqueue_hrtimer(timer, base);
>  
> -	WARN_ON_ONCE(base->running != timer);
> -	base->running = NULL;
> +	/*
> +	 * separate the ->running assignment from the ->state assignment
> +	 */
> +	write_seqcount_end(&cpu_base->seq);
> +
> +	WARN_ON_ONCE(cpu_base->running != timer);
> +	cpu_base->running = NULL;
>  }
>  
>  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-04 10:55           ` Peter Zijlstra
@ 2015-06-04 10:58             ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-04 10:58 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Thu, Jun 04, 2015 at 12:55:37PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 04, 2015 at 12:49:02PM +0200, Peter Zijlstra wrote:
> > Needs more comments at the very least, its fully of trickery.
> > 
> > @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
> >  	enum hrtimer_restart (*fn)(struct hrtimer *);
> >  	int restart;
> >  
> > -	WARN_ON(!irqs_disabled());
> > +	lockdep_assert_held(&cpu_base->lock);
> >  
> >  	debug_deactivate(timer);
> > -	base->running = timer;
> > +	cpu_base->running = timer;
> > +
> > +	/*
> > +	 * separate the ->running assignment from the ->state assignment
> > +	 */
> > +	write_seqcount_begin(&cpu_base->seq);
> 
> Maybe these need to be raw_write_seqcount_latch()..

I'm properly confusing myself, so let me write a little more detail:

I didn't think it needed the double wmb because:

	[S] running = timer;
	[S] seq++;
	    WMB
	[S] state = INACTIVE

I don't think it matters if we re-order the first two stores, since at
that point ->state is still ENQUEUED and we're good.

Similarly for the case below: you can flip the seq increment and the
->running clear, but at that time ->state should already be ENQUEUED
again (if indeed the timer got re-armed), otherwise it's not active
anymore.

> >  	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
> >  	timer_stats_account_hrtimer(timer);
> >  	fn = timer->function;
> > @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
> >  	    !(timer->state & HRTIMER_STATE_ENQUEUED))
> >  		enqueue_hrtimer(timer, base);
> >  
> > -	WARN_ON_ONCE(base->running != timer);
> > -	base->running = NULL;
> > +	/*
> > +	 * separate the ->running assignment from the ->state assignment
> > +	 */
> > +	write_seqcount_end(&cpu_base->seq);
> > +
> > +	WARN_ON_ONCE(cpu_base->running != timer);
> > +	cpu_base->running = NULL;
> >  }
> >  
> >  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-04 10:07         ` Peter Zijlstra
@ 2015-06-04 12:37           ` Ingo Molnar
  0 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2015-06-04 12:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Jun 04, 2015 at 07:59:30AM +0200, Ingo Molnar wrote:
> 
> > > --- a/include/linux/hrtimer.h
> > > +++ b/include/linux/hrtimer.h
> > > @@ -123,8 +123,10 @@ struct hrtimer_sleeper {
> > >  
> > >  #ifdef CONFIG_64BIT
> > >  # define HRTIMER_CLOCK_BASE_ALIGN	64
> > > +# define __timer_base_running(timer)	timer->base->running
> > >  #else
> > >  # define HRTIMER_CLOCK_BASE_ALIGN	32
> > > +# define __timer_base_running(timer)	timer->base->cpu_base->running
> > >  #endif
> > 
> > Please put it into the cpu_base on 64-bit as well: the base pointer is available 
> > already on 64-bit so there should be no measurable performance difference, and 
> > readability is a primary concern with all this code.
> 
> That's an extra pointer chase for no reason :-(

Only if we otherwise don't dereference cpu_base - is that the case in the relevant 
code paths?

If we already dereference cpu_base (say for the lock) and have its value loaded 
then it's totally equivalent to chasing down it in base->.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-04 10:49         ` Peter Zijlstra
  2015-06-04 10:55           ` Peter Zijlstra
@ 2015-06-05  9:02           ` Kirill Tkhai
  2015-06-05  9:03             ` Kirill Tkhai
  2015-06-05  9:10             ` Peter Zijlstra
  1 sibling, 2 replies; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-05  9:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Thu, 04/06/2015 at 12:49 +0200, Peter Zijlstra wrote:
On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
> > > --- a/include/linux/hrtimer.h
> > > +++ b/include/linux/hrtimer.h
> > > @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> > >   * A timer is active, when it is enqueued into the rbtree or the
> > >   * callback function is running or it's in the state of being migrated
> > >   * to another cpu.
> > > + *
> > > + * See __run_hrtimer().
> > >   */
> > > -static inline int hrtimer_active(const struct hrtimer *timer)
> > > +static inline bool hrtimer_active(const struct hrtimer *timer)
> > >  {
> > > -	return timer->state != HRTIMER_STATE_INACTIVE ||
> > > -		timer->base->running == timer;
> > > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > > +		return true;
> > > +
> > > +	smp_rmb(); /* C matches A */
> > > +
> > > +	if (timer->base->running == timer)
> > > +		return true;
> > > +
> > > +	smp_rmb(); /* D matches B */
> > > +
> > > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > > +		return true;
> > > +
> > > +	return false;
> > 
> > This races with two sequential timer handlers. hrtimer_active()
> > is preemptible everywhere, and no guarantees that all three "if"
> > conditions check the same timer tick.
> 
> Indeed.
> 
> > How about transformation of hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?
> 
> Ingo will like that because it means we already need to touch cpu_base.
> 
> But I think there's a problem there on timer migration, the timer can
> migrate between bases while we do the seq read loop and then you can get
> false positives on the different seqcount numbers.
> 
> We could of course do something like the below, but hrtimer_is_active()
> is turning into quite the monster.
> 
> Needs more comments at the very least, its fully of trickery.

Yeah, it's safe for now, but it may cause maintenance difficulties in
the future, because the barrier logic is not easy to review. But it
seems we may simplify it a little bit. Please see the comments below.

> ---
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -59,7 +59,9 @@ enum hrtimer_restart {
>   * mean touching the timer after the callback, this makes it impossible to free
>   * the timer from the callback function.
>   *
> - * Therefore we track the callback state in timer->base->running == timer.
> + * Therefore we track the callback state in:
> + *
> + * 	timer->base->cpu_base->running == timer
>   *
>   * On SMP it is possible to have a "callback function running and enqueued"
>   * status. It happens for example when a posix timer expired and the callback
> @@ -144,7 +146,6 @@ struct hrtimer_clock_base {
>  	struct timerqueue_head	active;
>  	ktime_t			(*get_time)(void);
>  	ktime_t			offset;
> -	struct hrtimer		*running;
>  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
>  
>  enum  hrtimer_base_type {
> @@ -159,6 +160,8 @@ enum  hrtimer_base_type {
>   * struct hrtimer_cpu_base - the per cpu clock bases
>   * @lock:		lock protecting the base and associated clock bases
>   *			and timers
> + * @seq:		seqcount around __run_hrtimer
> + * @running:		pointer to the currently running hrtimer
>   * @cpu:		cpu number
>   * @active_bases:	Bitfield to mark bases with active timers
>   * @clock_was_set_seq:	Sequence counter of clock was set events
> @@ -180,6 +183,8 @@ enum  hrtimer_base_type {
>   */
>  struct hrtimer_cpu_base {
>  	raw_spinlock_t			lock;
> +	seqcount_t			seq;
> +	struct hrtimer			*running;
>  	unsigned int			cpu;
>  	unsigned int			active_bases;
>  	unsigned int			clock_was_set_seq;
> @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
>   */
>  static inline int hrtimer_active(const struct hrtimer *timer)
>  {
> -	return timer->state != HRTIMER_STATE_INACTIVE ||
> -		timer->base->running == timer;
> +	struct hrtimer_cpu_base *cpu_base;
> +	unsigned int seq;
> +	bool active;
> +
> +	do {
> +		active = false;
> +		cpu_base = READ_ONCE(timer->base->cpu_base);
> +		seqcount_lockdep_reader_access(&cpu_base->seq);
> +		seq = raw_read_seqcount(&cpu_base->seq);
> +
> +		if (timer->state != HRTIMER_STATE_INACTIVE ||
> +		    cpu_base->running == timer)
> +			active = true;
> +
> +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> +		 cpu_base != READ_ONCE(timer->base->cpu_base));
> +
> +	return active;
>  }

This may race with migrate_hrtimer_list(), so the migration path needs
the write side of the seqcount too.
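
To make that concrete, the direction it points at would be something
like bracketing the body of the migrate_hrtimer_list() loop as well (a
sketch only, not a patch from this thread; the state argument stays
whatever the existing code uses):

	write_seqcount_begin(&old_base->cpu_base->seq);
	__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
	timer->base = new_base;
	enqueue_hrtimer(timer, new_base);
	write_seqcount_end(&old_base->cpu_base->seq);

so that a concurrent hrtimer_active() retry loop cannot observe the
half-done ->state / ->base update.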

>  
>  /*
> @@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
>   */
>  static inline int hrtimer_callback_running(struct hrtimer *timer)
>  {
> -	return timer->base->running == timer;
> +	return timer->base->cpu_base->running == timer;
>  }
>  
>  /* Forward a hrtimer so it expires after now: */
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -67,6 +67,7 @@
>  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
>  {
>  	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
> +	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
>  	.clock_base =
>  	{
>  		{
> @@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
>  /*
>   * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
>   * such that hrtimer_callback_running() can unconditionally dereference
> - * timer->base.
> + * timer->base->cpu_base
>   */
> -static struct hrtimer_clock_base migration_base;
> +static struct hrtimer_cpu_base migration_cpu_base = {
> +	.seq = SEQCNT_ZERO(migration_cpu_base),
> +};
> +
> +static struct hrtimer_clock_base migration_base {
> +	.cpu_base = &migration_cpu_base,
> +};
>  
>  /*
>   * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
> @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
>  	enum hrtimer_restart (*fn)(struct hrtimer *);
>  	int restart;
>  
> -	WARN_ON(!irqs_disabled());
> +	lockdep_assert_held(&cpu_base->lock);
>  
>  	debug_deactivate(timer);
> -	base->running = timer;
> +	cpu_base->running = timer;

My suggestion is not to use seqcounts around long stretches of code, and
instead implement short primitives for changing the timer state and the
cpu_base running timer. Something like this:

static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
{
	struct hrtimer_cpu_base	*cpu_base = timer->base->cpu_base;

	lockdep_assert_held(&cpu_base->lock);

	write_seqcount_begin(&cpu_base->seq);
	timer->state = state;
	write_seqcount_end(&cpu_base->seq);
}

static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
					struct hrtimer *timer)
{
	lockdep_assert_held(&cpu_base->lock);

	write_seqcount_begin(&cpu_base->seq);
	cpu_base->running = timer;
	write_seqcount_end(&cpu_base->seq);
}

With this implemented, we need to think less about the right barrier
order, because all changes are made under the seqcount.

static inline int hrtimer_active(const struct hrtimer *timer)
{
	struct hrtimer_cpu_base *cpu_base;
	struct hrtimer_clock_base *base;
	unsigned int seq;
	bool active = false;

	do {
		base = READ_ONCE(timer->base);
		if (base == &migration_base) {
			active = true;
			break;
		}

		cpu_base = base->cpu_base;
		seqcount_lockdep_reader_access(&cpu_base->seq);
		seq = raw_read_seqcount(&cpu_base->seq);

		if (timer->state != HRTIMER_STATE_INACTIVE ||
		    cpu_base->running == timer) {
			active = true;
			break;
		}
	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
		 READ_ONCE(timer->base) != base);

	return active;
}


> +
> +	/*
> +	 * separate the ->running assignment from the ->state assignment
> +	 */
> +	write_seqcount_begin(&cpu_base->seq);
> +
>  	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
>  	timer_stats_account_hrtimer(timer);
>  	fn = timer->function;
> @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
>  	    !(timer->state & HRTIMER_STATE_ENQUEUED))
>  		enqueue_hrtimer(timer, base);
>  
> -	WARN_ON_ONCE(base->running != timer);
> -	base->running = NULL;
> +	/*
> +	 * separate the ->running assignment from the ->state assignment
> +	 */
> +	write_seqcount_end(&cpu_base->seq);
> +
> +	WARN_ON_ONCE(cpu_base->running != timer);
> +	cpu_base->running = NULL;
>  }
>  
>  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  9:02           ` Kirill Tkhai
@ 2015-06-05  9:03             ` Kirill Tkhai
  2015-06-05  9:11               ` Peter Zijlstra
  2015-06-05  9:10             ` Peter Zijlstra
  1 sibling, 1 reply; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-05  9:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

This message is too late, /me going to see new series :)

05.06.2015, 12:02, "Kirill Tkhai" <tkhai@yandex.ru>:
> On Thu, 04/06/2015 at 12:49 +0200, Peter Zijlstra wrote:
> On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
>>>>  --- a/include/linux/hrtimer.h
>>>>  +++ b/include/linux/hrtimer.h
>>>>  @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
>>>>    * A timer is active, when it is enqueued into the rbtree or the
>>>>    * callback function is running or it's in the state of being migrated
>>>>    * to another cpu.
>>>>  + *
>>>>  + * See __run_hrtimer().
>>>>    */
>>>>  -static inline int hrtimer_active(const struct hrtimer *timer)
>>>>  +static inline bool hrtimer_active(const struct hrtimer *timer)
>>>>   {
>>>>  - return timer->state != HRTIMER_STATE_INACTIVE ||
>>>>  - timer->base->running == timer;
>>>>  + if (timer->state != HRTIMER_STATE_INACTIVE)
>>>>  + return true;
>>>>  +
>>>>  + smp_rmb(); /* C matches A */
>>>>  +
>>>>  + if (timer->base->running == timer)
>>>>  + return true;
>>>>  +
>>>>  + smp_rmb(); /* D matches B */
>>>>  +
>>>>  + if (timer->state != HRTIMER_STATE_INACTIVE)
>>>>  + return true;
>>>>  +
>>>>  + return false;
>>>  This races with two sequential timer handlers. hrtimer_active()
>>>  is preemptible everywhere, and no guarantees that all three "if"
>>>  conditions check the same timer tick.
>>  Indeed.
>>>  How about transformation of hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?
>>  Ingo will like that because it means we already need to touch cpu_base.
>>
>>  But I think there's a problem there on timer migration, the timer can
>>  migrate between bases while we do the seq read loop and then you can get
>>  false positives on the different seqcount numbers.
>>
>>  We could of course do something like the below, but hrtimer_is_active()
>>  is turning into quite the monster.
>>
>>  Needs more comments at the very least, its fully of trickery.
>
> Yeah, it's safe for now, but it may happen difficulties with a support
> in the future, because barrier logic is not easy to review. But it seems
> we may simplify it a little bit. Please, see the comments below.
>>  ---
>>  --- a/include/linux/hrtimer.h
>>  +++ b/include/linux/hrtimer.h
>>  @@ -59,7 +59,9 @@ enum hrtimer_restart {
>>    * mean touching the timer after the callback, this makes it impossible to free
>>    * the timer from the callback function.
>>    *
>>  - * Therefore we track the callback state in timer->base->running == timer.
>>  + * Therefore we track the callback state in:
>>  + *
>>  + * timer->base->cpu_base->running == timer
>>    *
>>    * On SMP it is possible to have a "callback function running and enqueued"
>>    * status. It happens for example when a posix timer expired and the callback
>>  @@ -144,7 +146,6 @@ struct hrtimer_clock_base {
>>           struct timerqueue_head active;
>>           ktime_t (*get_time)(void);
>>           ktime_t offset;
>>  - struct hrtimer *running;
>>   } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
>>
>>   enum  hrtimer_base_type {
>>  @@ -159,6 +160,8 @@ enum  hrtimer_base_type {
>>    * struct hrtimer_cpu_base - the per cpu clock bases
>>    * @lock: lock protecting the base and associated clock bases
>>    * and timers
>>  + * @seq: seqcount around __run_hrtimer
>>  + * @running: pointer to the currently running hrtimer
>>    * @cpu: cpu number
>>    * @active_bases: Bitfield to mark bases with active timers
>>    * @clock_was_set_seq: Sequence counter of clock was set events
>>  @@ -180,6 +183,8 @@ enum  hrtimer_base_type {
>>    */
>>   struct hrtimer_cpu_base {
>>           raw_spinlock_t lock;
>>  + seqcount_t seq;
>>  + struct hrtimer *running;
>>           unsigned int cpu;
>>           unsigned int active_bases;
>>           unsigned int clock_was_set_seq;
>>  @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
>>    */
>>   static inline int hrtimer_active(const struct hrtimer *timer)
>>   {
>>  - return timer->state != HRTIMER_STATE_INACTIVE ||
>>  - timer->base->running == timer;
>>  + struct hrtimer_cpu_base *cpu_base;
>>  + unsigned int seq;
>>  + bool active;
>>  +
>>  + do {
>>  + active = false;
>>  + cpu_base = READ_ONCE(timer->base->cpu_base);
>>  + seqcount_lockdep_reader_access(&cpu_base->seq);
>>  + seq = raw_read_seqcount(&cpu_base->seq);
>>  +
>>  + if (timer->state != HRTIMER_STATE_INACTIVE ||
>>  +    cpu_base->running == timer)
>>  + active = true;
>>  +
>>  + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
>>  + cpu_base != READ_ONCE(timer->base->cpu_base));
>>  +
>>  + return active;
>>   }
>
> This may race with migrate_hrtimer_list(), so it needs write seqcounter
> too.
>>   /*
>>  @@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
>>    */
>>   static inline int hrtimer_callback_running(struct hrtimer *timer)
>>   {
>>  - return timer->base->running == timer;
>>  + return timer->base->cpu_base->running == timer;
>>   }
>>
>>   /* Forward a hrtimer so it expires after now: */
>>  --- a/kernel/time/hrtimer.c
>>  +++ b/kernel/time/hrtimer.c
>>  @@ -67,6 +67,7 @@
>>   DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
>>   {
>>           .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
>>  + .seq = SEQCNT_ZERO(hrtimer_bases.seq),
>>           .clock_base =
>>           {
>>                   {
>>  @@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
>>   /*
>>    * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
>>    * such that hrtimer_callback_running() can unconditionally dereference
>>  - * timer->base.
>>  + * timer->base->cpu_base
>>    */
>>  -static struct hrtimer_clock_base migration_base;
>>  +static struct hrtimer_cpu_base migration_cpu_base = {
>>  + .seq = SEQCNT_ZERO(migration_cpu_base),
>>  +};
>>  +
>>  +static struct hrtimer_clock_base migration_base {
>>  + .cpu_base = &migration_cpu_base,
>>  +};
>>
>>   /*
>>    * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
>>  @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
>>           enum hrtimer_restart (*fn)(struct hrtimer *);
>>           int restart;
>>
>>  - WARN_ON(!irqs_disabled());
>>  + lockdep_assert_held(&cpu_base->lock);
>>
>>           debug_deactivate(timer);
>>  - base->running = timer;
>>  + cpu_base->running = timer;
>
> My suggestion is do not use seqcounters for long parts of code, and implement
> short primitives for changing timer state and cpu_base running timer. Something
> like this:
>
> static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
> {
>         struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
>
>         lockdep_assert_held(&cpu_base->lock);
>
>         write_seqcount_begin(&cpu_base->seq);
>         timer->state = state;
>         write_seqcount_end(&cpu_base->seq);
> }
>
> static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
>                                         struct hrtimer *timer)
> {
>         lockdep_assert_held(&cpu_base->lock);
>
>         write_seqcount_begin(&cpu_base->seq);
>         cpu_base->running = timer;
>         write_seqcount_end(&cpu_base->seq);
> }
>
> Implemented this, we may less think about right barrier order, because
> all changes are being made under seqcount.
>
> static inline int hrtimer_active(const struct hrtimer *timer)
> {
>         struct hrtimer_cpu_base *cpu_base;
>         struct hrtimer_clock_base *base;
>         unsigned int seq;
>         bool active = false;
>
>         do {
>                 base = READ_ONCE(timer->base);
>                 if (base == &migration_base) {
>                         active = true;
>                         break;
>                 }
>
>                 cpu_base = base->cpu_base;
>                 seqcount_lockdep_reader_access(&cpu_base->seq);
>                 seq = raw_read_seqcount(&cpu_base->seq);
>
>                 if (timer->state != HRTIMER_STATE_INACTIVE ||
>                     cpu_base->running == timer) {
>                         active = true;
>                         break;
>                 }
>         } while (read_seqcount_retry(&cpu_base->seq, seq) ||
>                  READ_ONCE(timer->base) != base);
>
>         return active;
> }
>>  +
>>  + /*
>>  + * separate the ->running assignment from the ->state assignment
>>  + */
>>  + write_seqcount_begin(&cpu_base->seq);
>>  +
>>           __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
>>           timer_stats_account_hrtimer(timer);
>>           fn = timer->function;
>>  @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
>>               !(timer->state & HRTIMER_STATE_ENQUEUED))
>>                   enqueue_hrtimer(timer, base);
>>
>>  - WARN_ON_ONCE(base->running != timer);
>>  - base->running = NULL;
>>  + /*
>>  + * separate the ->running assignment from the ->state assignment
>>  + */
>>  + write_seqcount_end(&cpu_base->seq);
>>  +
>>  + WARN_ON_ONCE(cpu_base->running != timer);
>>  + cpu_base->running = NULL;
>>   }
>>
>>   static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  9:02           ` Kirill Tkhai
  2015-06-05  9:03             ` Kirill Tkhai
@ 2015-06-05  9:10             ` Peter Zijlstra
  2015-06-05  9:27               ` Kirill Tkhai
  1 sibling, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-05  9:10 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Fri, Jun 05, 2015 at 12:02:01PM +0300, Kirill Tkhai wrote:
> Yeah, it's safe for now, but it may happen difficulties with a support
> in the future, because barrier logic is not easy to review. But it seems
> we may simplify it a little bit. Please, see the comments below.
> > @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
> >   */
> >  static inline int hrtimer_active(const struct hrtimer *timer)
> >  {
> > +	struct hrtimer_cpu_base *cpu_base;
> > +	unsigned int seq;
> > +	bool active;
> > +
> > +	do {
> > +		active = false;
> > +		cpu_base = READ_ONCE(timer->base->cpu_base);
> > +		seqcount_lockdep_reader_access(&cpu_base->seq);
> > +		seq = raw_read_seqcount(&cpu_base->seq);
> > +
> > +		if (timer->state != HRTIMER_STATE_INACTIVE ||
> > +		    cpu_base->running == timer)
> > +			active = true;
> > +
> > +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > +		 cpu_base != READ_ONCE(timer->base->cpu_base));
> > +
> > +	return active;
> >  }
> 
> This may race with migrate_hrtimer_list(), so it needs write seqcounter
> too.

Let me go stare at that.

> My suggestion is do not use seqcounters for long parts of code, and implement
> short primitives for changing timer state and cpu_base running timer. Something
> like this:
> 
> static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
> {
> 	struct hrtimer_cpu_base	*cpu_base = timer->base->cpu_base;
> 
> 	lockdep_assert_held(&cpu_base->lock);
> 
> 	write_seqcount_begin(&cpu_base->seq);
> 	timer->state = state;
> 	write_seqcount_end(&cpu_base->seq);
> }
> 
> static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
> 					struct hrtimer *timer)
> {
> 	lockdep_assert_held(&cpu_base->lock);
> 
> 	write_seqcount_begin(&cpu_base->seq);
> 	cpu_base->running = timer;
> 	write_seqcount_end(&cpu_base->seq);
> }
> 
> Implemented this, we may less think about right barrier order, because
> all changes are being made under seqcount.

The problem with that is that it generates far more memory barriers;
while on x86 these are 'free', that is very much not so for other archs
like ARM / PPC etc.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  9:03             ` Kirill Tkhai
@ 2015-06-05  9:11               ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-05  9:11 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Fri, Jun 05, 2015 at 12:03:34PM +0300, Kirill Tkhai wrote:
> This message is too late, /me going to see new series :)

Never too late, and thanks for looking at them. The new patch is
basically the same as the last proposal (except build fixes and a
comment).

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  9:10             ` Peter Zijlstra
@ 2015-06-05  9:27               ` Kirill Tkhai
  0 siblings, 0 replies; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-05  9:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

05.06.2015, 12:10, "Peter Zijlstra" <peterz@infradead.org>:
> On Fri, Jun 05, 2015 at 12:02:01PM +0300, Kirill Tkhai wrote:
>>  Yeah, it's safe for now, but it may happen difficulties with a support
>>  in the future, because barrier logic is not easy to review. But it seems
>>  we may simplify it a little bit. Please, see the comments below.
>>>  @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
>>>    */
>>>   static inline int hrtimer_active(const struct hrtimer *timer)
>>>   {
>>>  + struct hrtimer_cpu_base *cpu_base;
>>>  + unsigned int seq;
>>>  + bool active;
>>>  +
>>>  + do {
>>>  + active = false;
>>>  + cpu_base = READ_ONCE(timer->base->cpu_base);
>>>  + seqcount_lockdep_reader_access(&cpu_base->seq);
>>>  + seq = raw_read_seqcount(&cpu_base->seq);
>>>  +
>>>  + if (timer->state != HRTIMER_STATE_INACTIVE ||
>>>  +    cpu_base->running == timer)
>>>  + active = true;
>>>  +
>>>  + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
>>>  + cpu_base != READ_ONCE(timer->base->cpu_base));
>>>  +
>>>  + return active;
>>>   }
>>  This may race with migrate_hrtimer_list(), so it needs write seqcounter
>>  too.
>
> Let me go stare at that.
>>  My suggestion is do not use seqcounters for long parts of code, and implement
>>  short primitives for changing timer state and cpu_base running timer. Something
>>  like this:
>>
>>  static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
>>  {
>>          struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
>>
>>          lockdep_assert_held(&cpu_base->lock);
>>
>>          write_seqcount_begin(&cpu_base->seq);
>>          timer->state = state;
>>          write_seqcount_end(&cpu_base->seq);
>>  }
>>
>>  static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
>>                                          struct hrtimer *timer)
>>  {
>>          lockdep_assert_held(&cpu_base->lock);
>>
>>          write_seqcount_begin(&cpu_base->seq);
>>          cpu_base->running = timer;
>>          write_seqcount_end(&cpu_base->seq);
>>  }
>>
>>  Implemented this, we may less think about right barrier order, because
>>  all changes are being made under seqcount.
>
> The problem with that is that it generates far more memory barriers,
> while on x86 these are 'free', this is very much not so for other archs
> like ARM / PPC etc.

Ok, thanks.

One more way is to take the write_seqcount every time we take the base
spin_lock; then we could group several smp_wmb()s into a single one. But
this enlarges the write-side seqcount sections and worsens the speed of
hrtimer_active(), so it's not good either.
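
A sketch of that variant (the helper names here are invented purely for
illustration):

static inline void hrtimer_cpu_base_lock(struct hrtimer_cpu_base *cpu_base)
{
	raw_spin_lock(&cpu_base->lock);
	write_seqcount_begin(&cpu_base->seq);
}

static inline void hrtimer_cpu_base_unlock(struct hrtimer_cpu_base *cpu_base)
{
	write_seqcount_end(&cpu_base->seq);
	raw_spin_unlock(&cpu_base->lock);
}

i.e. every base-locked section becomes a write-side seqcount section, so
hrtimer_active() readers retry whenever anything at all happens under the
base lock, not only around the ->state / ->running updates.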

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2015-06-05  9:28 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-03 13:29 [PATCH 0/9] sched: balance callbacks Peter Zijlstra
2015-06-03 13:29 ` [PATCH 1/9] sched: Replace post_schedule with a balance callback list Peter Zijlstra
2015-06-03 13:29 ` [PATCH 2/9] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
2015-06-03 13:29 ` [PATCH 3/9] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
2015-06-03 13:29 ` [PATCH 4/9] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
2015-06-03 13:29 ` [PATCH 5/9] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
2015-06-03 13:29 ` [PATCH 6/9] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
2015-06-03 13:29 ` [PATCH 7/9] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
2015-06-03 13:29 ` [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
2015-06-03 16:26   ` Kirill Tkhai
2015-06-03 21:13     ` Peter Zijlstra
2015-06-04  9:07       ` Kirill Tkhai
2015-06-04 10:49         ` Peter Zijlstra
2015-06-04 10:55           ` Peter Zijlstra
2015-06-04 10:58             ` Peter Zijlstra
2015-06-05  9:02           ` Kirill Tkhai
2015-06-05  9:03             ` Kirill Tkhai
2015-06-05  9:11               ` Peter Zijlstra
2015-06-05  9:10             ` Peter Zijlstra
2015-06-05  9:27               ` Kirill Tkhai
2015-06-03 17:41   ` Thomas Gleixner
2015-06-03 21:29     ` Peter Zijlstra
2015-06-04  5:59       ` Ingo Molnar
2015-06-04 10:07         ` Peter Zijlstra
2015-06-04 12:37           ` Ingo Molnar
2015-06-03 13:29 ` [PATCH 9/9] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
