linux-kernel.vger.kernel.org archive mirror
* [PATCH 00/14] sched: balance callbacks
@ 2015-06-05  8:48 Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 01/14] sched: Replace post_schedule with a balance callback list Peter Zijlstra
                   ` (13 more replies)
  0 siblings, 14 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

Mike stumbled over a cute bug where the RT/DL balancing ops caused a crash.

The exact scenario is __sched_setscheduler() changing a (runnable) task from
FIFO to OTHER. In switched_from_rt(), where we do pull_rt_task(), we temporarily
drop rq->lock. This gap allows regular cfs load-balancing to step in and
migrate our task.

However, check_class_changed() will happily continue with switched_to_fair()
which assumes our task is still on the old rq and makes the kernel go boom.

Instead of trying to patch this up and make things complicated, simply disallow
these methods from dropping rq->lock, extend the current post_schedule stuff
into a balance callback list, and use that.

This survives Mike's testcase.

Changes since -v2:
 - reworked the hrtimer patch. -- Kirill, tglx
 - added lock pinning

Changes since -v1:
 - make SMP=n build,
 - cured switched_from_dl()'s cancel_dl_timer().

No real tests on the new parts other than booting / building kernels.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 01/14] sched: Replace post_schedule with a balance callback list
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 02/14] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-1.patch --]
[-- Type: text/plain, Size: 7096 bytes --]

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and across
sched_class boundaries.
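
For illustration, a minimal sketch of how a sched_class is expected to use
the new interface (my_balance_head / my_balance_fn / queue_my_balance_work
are made-up names; the helpers are the ones added by this patch):

	static DEFINE_PER_CPU(struct callback_head, my_balance_head);

	/* runs later, from balance_callback(), with rq->lock held */
	static void my_balance_fn(struct rq *rq)
	{
		/* push/pull tasks; may drop and retake rq->lock */
	}

	/* called with rq->lock held, e.g. from pick_next_task_*() */
	static void queue_my_balance_work(struct rq *rq)
	{
		queue_balance_callback(rq, &per_cpu(my_balance_head, rq->cpu),
				       my_balance_fn);
	}

balance_callback(rq) then runs the queued work from schedule() /
schedule_tail() once the important bits are done.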

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++------------
 kernel/sched/deadline.c |   21 +++++++++++----------
 kernel/sched/rt.c       |   25 +++++++++++--------------
 kernel/sched/sched.h    |   19 +++++++++++++++++--
 4 files changed, 63 insertions(+), 38 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(str
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2822,7 +2834,7 @@ static void __sched __schedule(void)
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
-	post_schedule(rq);
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -7216,7 +7228,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	if (!has_pushable_dl_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -296,7 +303,7 @@ static inline int pull_dl_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(st
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1544,11 +1551,6 @@ static int pull_dl_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
-}
-
 /*
  * Since the task is not running and a reschedule is not going to happen
  * anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -2042,11 +2045,6 @@ static int pull_rt_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
-}
-
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -767,6 +768,21 @@ extern int migrate_swap(struct task_stru
 
 #ifdef CONFIG_SMP
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);
+
+	if (unlikely(head->next))
+		return;
+
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -1192,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 02/14] sched: Use replace normalize_task() with __sched_setscheduler()
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 01/14] sched: Replace post_schedule with a balance callback list Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 03/14] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-6.patch --]
[-- Type: text/plain, Size: 3953 bytes --]

Reduce duplicate logic; normalize_task() is a simplified version of
__sched_setscheduler(). Parametrize the difference and collapse.

This reduces the number of check_class_changed() sites.
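
Condensed, the two flavours of the call after this patch (taken from the
hunks below); the new 'pi' argument controls whether PI boosting is taken
into account:

	/* user request via sched_setattr(): checks and PI handling enabled */
	return __sched_setscheduler(p, attr, true, true);

	/* sysrq normalize_rt_tasks(): no permission checks, no PI handling */
	__sched_setscheduler(p, &attr, false, false);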

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   65 ++++++++++++++++++----------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3428,7 +3428,7 @@ static bool dl_param_changed(struct task
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3614,18 +3614,20 @@ static int __sched_setscheduler(struct t
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3636,7 +3638,7 @@ static int __sched_setscheduler(struct t
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3651,7 +3653,8 @@ static int __sched_setscheduler(struct t
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 
 	return 0;
 }
@@ -3672,7 +3675,7 @@ static int _sched_setscheduler(struct ta
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3693,7 +3696,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -7354,32 +7357,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7406,9 +7389,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 03/14] sched: Allow balance callbacks for check_class_changed()
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 01/14] sched: Replace post_schedule with a balance callback list Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 02/14] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 04/14] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-7.patch --]
[-- Type: text/plain, Size: 3024 bytes --]

In order to remove the rq->lock drops from the
switched_{to,from}()/prio_changed() sched_class methods, run the balance
callbacks after check_class_changed() instead.

We need to stop dropping rq->lock because it is buggy; suppose we use
sched_setattr()/sched_setscheduler() to change a running task from FIFO
to OTHER.

By the time we get to switched_from_rt() the task is already enqueued
on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
drops rq->lock, load-balancing can come in and move our task @p to
another rq.

The subsequent switched_to_fair() still assumes @p is on @rq and bad
things will happen.

By using balance callbacks we delay the load-balancing operations
{rt,dl}x{push,pull} until we've done all the important work and the
task is fully set up.

Furthermore, the balance callbacks do not know about @p, therefore
they cannot get confused like this.
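
Condensed from the hunks below, the resulting pattern at the call sites:
keep preemption disabled across the unlock so the rq cannot go away under
us, then run the queued callbacks:

	check_class_changed(rq, p, prev_class, oldprio);

	preempt_disable(); /* avoid rq from going away on us */
	task_rq_unlock(rq, p, &flags);

	if (pi)
		rt_mutex_adjust_pi(p);

	/* run the queued {rt,dl} push/pull work after adjusting the PI chain */
	balance_callback(rq);
	preempt_enable();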

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_s
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1491,8 +1495,12 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * XXX can drop rq->lock; most likely ok.
+		 */
 		p->sched_class->task_woken(rq, p);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -3094,7 +3102,11 @@ void rt_mutex_setprio(struct task_struct
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3655,11 +3667,18 @@ static int __sched_setscheduler(struct t
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
+
 	return 0;
 }
 



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 04/14] sched,rt: Remove return value from pull_rt_task()
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 03/14] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 05/14] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-2.patch --]
[-- Type: text/plain, Size: 2408 bytes --]

In order to be able to use pull_rt_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to rt balancing.

Too many reschedules is not a correctness issue; too few are.
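
Condensed, the shape of the conversion (see the hunks below): the
reschedule decision moves into pull_rt_task() itself:

	static void pull_rt_task(struct rq *this_rq)
	{
		bool resched = false;

		/* ... pull eligible -rt tasks onto this_rq, setting
		 *     resched = true whenever a task was moved ... */

		if (resched)
			resched_curr(this_rq);
	}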

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_gro
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -415,9 +415,8 @@ static inline bool need_pull_rt_task(str
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1955,14 +1954,15 @@ static void push_irq_work_func(struct ir
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1973,7 +1973,7 @@ static int pull_rt_task(struct rq *this_
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2026,7 +2026,7 @@ static int pull_rt_task(struct rq *this_
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2042,7 +2042,8 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2138,8 +2139,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 05/14] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 04/14] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 06/14] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-3.patch --]
[-- Type: text/plain, Size: 3009 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_rt() / prio_changed_rt() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is merely annoying.
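
A condensed view of the new wrapper (names as added by the hunks below);
the class-change hooks now only queue work instead of balancing directly:

	static inline void queue_pull_task(struct rq *rq)
	{
		queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu),
				       pull_rt_task);
	}

	/* e.g. switched_from_rt() now does: */
	queue_pull_task(rq);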

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,16 +354,23 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 
 static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -2139,7 +2146,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	pull_rt_task(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2160,8 +2167,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2171,13 +2176,12 @@ static void switched_to_rt(struct rq *rq
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2198,14 +2202,13 @@ prio_changed_rt(struct rq *rq, struct ta
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 06/14] sched,dl: Remove return value from pull_dl_task()
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (4 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 05/14] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 07/14] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-4.patch --]
[-- Type: text/plain, Size: 2135 bytes --]

In order to be able to use pull_dl_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to dl balancing.

Too many reschedules is not a correctness issue; too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -298,9 +298,8 @@ static inline bool need_pull_dl_task(str
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1049,7 +1048,7 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
+static void pull_dl_task(struct rq *this_rq);
 
 #endif /* CONFIG_SMP */
 
@@ -1480,15 +1479,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1543,7 +1543,7 @@ static int pull_dl_task(struct rq *this_
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1556,7 +1556,8 @@ static int pull_dl_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1712,8 +1713,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	pull_dl_task(rq);
 }
 
 /*



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 07/14] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (5 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 06/14] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-5.patch --]
[-- Type: text/plain, Size: 3223 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_dl() / prio_changed_dl() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is merely annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,16 +213,23 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
 static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
+	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -305,6 +312,10 @@ static inline void pull_dl_task(struct r
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -1048,8 +1059,6 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static void pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1713,7 +1722,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	pull_dl_task(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1722,21 +1731,16 @@ static void switched_from_dl(struct rq *
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1756,15 +1760,14 @@ static void prio_changed_dl(struct rq *r
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (6 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 07/14] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  9:48   ` Thomas Gleixner
                     ` (2 more replies)
  2015-06-05  8:48 ` [PATCH 09/14] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
                   ` (5 subsequent siblings)
  13 siblings, 3 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-hrtimer-base-running.patch --]
[-- Type: text/plain, Size: 11432 bytes --]

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after the callback returns. Freeing the timer would turn that clear
into a use-after-free.

Solve this by using a scheme similar to regular timers: track the
currently running timer in hrtimer_cpu_base::running.
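
As an illustration of what this enables (a hypothetical example, not part
of the patch; my_obj / my_timeout are made-up names): a callback can now
free the structure containing its own timer, because __run_hrtimer() no
longer writes timer->state after the callback returns:

	struct my_obj {
		struct hrtimer timer;
		/* ... */
	};

	static enum hrtimer_restart my_timeout(struct hrtimer *t)
	{
		struct my_obj *obj = container_of(t, struct my_obj, timer);

		/*
		 * The callback state now lives in cpu_base->running, so
		 * nothing touches the (freed) timer once we return.
		 */
		kfree(obj);
		return HRTIMER_NORESTART;
	}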

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/hrtimer.h |   47 ++++++-----------
 kernel/time/hrtimer.c   |  130 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 122 insertions(+), 55 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,34 +53,27 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
+ * 0x02		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
+#define HRTIMER_STATE_MIGRATE	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -167,6 +160,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -188,6 +183,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -395,15 +392,7 @@ extern ktime_t hrtimer_get_remaining(con
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -419,7 +408,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -111,6 +112,19 @@ static inline int hrtimer_clockid_to_bas
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+};
+
+static struct hrtimer_clock_base migration_base = {
+	.cpu_base = &migration_cpu_base,
+};
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +133,8 @@ static inline int hrtimer_clockid_to_bas
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +144,7 @@ struct hrtimer_clock_base *lock_hrtimer_
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +208,8 @@ switch_hrtimer_base(struct hrtimer *time
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -840,11 +854,7 @@ static int enqueue_hrtimer(struct hrtime
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -894,7 +904,6 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
 		int reprogram;
 
 		/*
@@ -908,13 +917,8 @@ remove_hrtimer(struct hrtimer *timer, st
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
-		__remove_hrtimer(timer, base, state, reprogram);
+
+		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram);
 		return 1;
 	}
 	return 0;
@@ -1114,6 +1118,70 @@ void hrtimer_init(struct hrtimer *timer,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+	bool active;
+
+	do {
+		active = false;
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			active = true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return active;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ *     __run_hrtimer()			    hrtimer_active()
+ *
+ * [S] cpu_base->running = timer	[R] timer->base->cpu_base
+ * [S] seq++				[R] seq
+ *     WMB				    RMB
+ * [S] timer->state = INACTIVE
+ *					[R] timer->state
+ *	fn();				[R] cpu_base->running
+ *
+ * [S] timer->state = ENQUEUED (optional)
+ *     WMB				    RMB
+ * [S] seq++				[R] seq
+ * [S] cpu_base->running = NULL		[R] timer->base->cpu_base
+ *
+ *
+ * The WMBs+seq on the __run_hrtimer() split the thing into 3 distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being ran
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * Note: this is unusual seqlock usage in that we do not have the odd/even
+ * thing, all we really care about is both reads happening in the same section
+ * of __run_hrtimer().
+ *
+ * Note: both seq increments are not fully store ordered and this is fine, if
+ * for example the first seq increment is observed before the ->running store
+ * the section edge shifts slightly, but its a safe shift because here ->state
+ * is still ENQUEUED.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1121,10 +1189,17 @@ static void __run_hrtimer(struct hrtimer
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * separate the ->running assignment from the ->state assignment
+	 */
+	raw_write_seqcount_begin(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1140,7 +1215,7 @@ static void __run_hrtimer(struct hrtimer
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1152,9 +1227,13 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * separate the ->running assignment from the ->state assignment
+	 */
+	raw_write_seqcount_end(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
@@ -1523,11 +1602,10 @@ static void migrate_hrtimer_list(struct
 		 * hrtimer_interrupt after we migrated everything to
 		 * sort out already expired timers and reprogram the
 		 * event device.
+		 *
+		 * Sets timer->state = HRTIMER_STATE_ENQUEUED.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 09/14] sched,dl: Fix sched class hopping CBS hole
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (7 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 10/14] sched: Move code around Peter Zijlstra
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz, Luca Abeni, Juri Lelli

[-- Attachment #1: peterz-sched-dl-cbs-bandwidth-timer.patch --]
[-- Type: text/plain, Size: 9142 bytes --]

We still have a few pending issues with the deadline code, one of which
is that switching between scheduling classes can 'leak' CBS state.

Close the hole by retaining the current CBS state when leaving
SCHED_DEADLINE and unconditionally programming the deadline timer.
The timer will then reset the CBS state if the task is still
!SCHED_DEADLINE by the time it hits.

If the task leaves SCHED_DEADLINE and then exits, task_dead_dl() will not
be called and we'll not cancel the hrtimer, leaving us with a pending timer
pointing at freed memory. Avoid this by giving the timer a task reference;
this also avoids littering the task exit path for this rather uncommon case.

In order to do this, I had to move dl_task_offline_migration() below
the replenishment, such that the task_rq()->lock fully covers that.
While doing this, I noticed that it (was) buggy in assuming a task is
enqueued and/or we need to enqueue the task now. Fixing this means
select_task_rq_dl() might encounter an offline rq -- look into that.

As a result, this kills cancel_dl_timer(), which included an rq->lock
break.
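
Condensed from the hunks below, the reference pairing that keeps the timer
safe after the task has left SCHED_DEADLINE:

	/* start_dl_timer(): only take a ref when we actually arm the timer */
	if (!hrtimer_is_queued(timer)) {
		get_task_struct(p);
		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
	}

	/* dl_task_timer(): drop the ref once we're done with the task */
	task_rq_unlock(rq, p, &flags);
	put_task_struct(p);
	return HRTIMER_NORESTART;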

Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
Cc: Wanpeng Li <wanpeng.li@linux.intel.com>
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |  152 +++++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 66 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -234,7 +234,7 @@ static inline void queue_pull_task(struc
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
 	bool fallback = false;
@@ -268,14 +268,19 @@ static void dl_task_offline_migration(st
 		double_lock_balance(rq, later_rq);
 	}
 
+	/*
+	 * By now the task is replenished and enqueued; migrate it.
+	 */
 	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, ENQUEUE_REPLENISH);
+	activate_task(later_rq, p, 0);
 
 	if (!fallback)
 		resched_curr(later_rq);
 
-	double_unlock_balance(rq, later_rq);
+	double_unlock_balance(later_rq, rq);
+
+	return later_rq;
 }
 
 #else
@@ -515,22 +520,23 @@ static void update_dl_entity(struct sche
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->dl_timer;
+	struct rq *rq = task_rq(p);
 	ktime_t now, act;
 	s64 delta;
 
-	if (boosted)
-		return 0;
+	lockdep_assert_held(&rq->lock);
+
 	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
 	act = ns_to_ktime(dl_se->deadline);
-	now = hrtimer_cb_get_time(&dl_se->dl_timer);
+	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
 
@@ -542,7 +548,19 @@ static int start_dl_timer(struct sched_d
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
+	/*
+	 * !enqueued will guarantee another callback; even if one is already in
+	 * progress. This ensures a balanced {get,put}_task_struct().
+	 *
+	 * The race against __run_timer() clearing the enqueued state is
+	 * harmless because we're holding task_rq()->lock, therefore the timer
+	 * expiring after we've done the check will wait on its task_rq_lock()
+	 * and observe our state.
+	 */
+	if (!hrtimer_is_queued(timer)) {
+		get_task_struct(p);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+	}
 
 	return 1;
 }
@@ -572,35 +590,40 @@ static enum hrtimer_restart dl_task_time
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_fromd_dl()).
+	 */
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * This is possible if switched_from_dl() raced against a running
+	 * callback that took the above !dl_task() path and we've since then
+	 * switched back into SCHED_DEADLINE.
 	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * There's nothing to do except drop our task reference.
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (dl_se->dl_new)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path, its not throttled.
+	 */
+	if (dl_se->dl_boosted)
+		goto unlock;
 
-#ifdef CONFIG_SMP
 	/*
-	 * If we find that the rq the task was on is no longer
-	 * available, we need to select a new rq.
+	 * Spurious timer due to start_dl_timer() race; or we already received
+	 * a replenishment from rt_mutex_setprio().
 	 */
-	if (unlikely(!rq->online)) {
-		dl_task_offline_migration(rq, p);
+	if (!dl_se->dl_throttled)
 		goto unlock;
-	}
-#endif
+
+	sched_clock_tick();
+	update_rq_clock(rq);
 
 	/*
 	 * If the throttle happened during sched-out; like:
@@ -626,17 +649,38 @@ static enum hrtimer_restart dl_task_time
 		check_preempt_curr_dl(rq, p, 0);
 	else
 		resched_curr(rq);
+
 #ifdef CONFIG_SMP
 	/*
-	 * Queueing this task back might have overloaded rq,
-	 * check if we need to kick someone away.
+	 * Perform balancing operations here; after the replenishments.  We
+	 * cannot drop rq->lock before this, otherwise the assertion in
+	 * start_dl_timer() about not missing updates is not true.
+	 *
+	 * If we find that the rq the task was on is no longer available, we
+	 * need to select a new rq.
+	 *
+	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
+	 */
+	if (unlikely(!rq->online))
+		rq = dl_task_offline_migration(rq, p);
+
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
 	 */
 	if (has_pushable_dl_tasks(rq))
 		push_dl_task(rq);
 #endif
+
 unlock:
 	task_rq_unlock(rq, p, &flags);
 
+	/*
+	 * This can free the task_struct, including this hrtimer, do not touch
+	 * anything related to that after this.
+	 */
+	put_task_struct(p);
+
 	return HRTIMER_NORESTART;
 }
 
@@ -696,7 +740,7 @@ static void update_curr_dl(struct rq *rq
 	if (dl_runtime_exceeded(rq, dl_se)) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -1178,7 +1222,6 @@ static void task_fork_dl(struct task_str
 
 static void task_dead_dl(struct task_struct *p)
 {
-	struct hrtimer *timer = &p->dl.dl_timer;
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 	/*
@@ -1188,8 +1231,6 @@ static void task_dead_dl(struct task_str
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
-
-	hrtimer_cancel(timer);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1674,37 +1715,16 @@ void init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 */
+	if (!start_dl_timer(p))
+		__dl_clear_params(p);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 10/14] sched: Move code around
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (8 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 09/14] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 11/14] sched: Streamline the task migration locking a little Peter Zijlstra
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-move-smp.patch --]
[-- Type: text/plain, Size: 12382 bytes --]

In preparation for reworking set_cpus_allowed_ptr(), move some code
around. This also removes some superfluous #ifdefs and adds comments
to some #endifs.

   text    data     bss     dec     hex filename
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.pre
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.post

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |  364 +++++++++++++++++++++++++---------------------------
 1 file changed, 178 insertions(+), 186 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1046,6 +1046,180 @@ void check_preempt_curr(struct rq *rq, s
 }
 
 #ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_held(&rq->lock);
+
+	dequeue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, new_cpu);
+	raw_spin_unlock(&rq->lock);
+
+	rq = cpu_rq(new_cpu);
+
+	raw_spin_lock(&rq->lock);
+	BUG_ON(task_cpu(p) != new_cpu);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	enqueue_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+
+	return rq;
+}
+
+struct migration_arg {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!cpu_active(dest_cpu)))
+		return ret;
+
+	rq = cpu_rq(src_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto done;
+
+	/* Affinity changed (again). */
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+		goto fail;
+
+	/*
+	 * If we're not on a rq, the next wake-up will ensure we're
+	 * placed properly.
+	 */
+	if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+done:
+	ret = 1;
+fail:
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+	return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+	struct migration_arg *arg = data;
+
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	/*
+	 * We need to explicitly wake pending tasks before running
+	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+	 */
+	sched_ttwu_pending();
+	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+	local_irq_enable();
+	return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int dest_cpu;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	if (cpumask_equal(&p->cpus_allowed, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, &flags);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	} else if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+out:
+	task_rq_unlock(rq, p, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1186,13 +1360,6 @@ int migrate_swap(struct task_struct *cur
 	return ret;
 }
 
-struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1325,9 +1492,7 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
 
-#ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
@@ -1432,7 +1597,7 @@ static void update_avg(u64 *avg, u64 sam
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -4767,149 +4932,6 @@ int task_can_attach(struct task_struct *
 }
 
 #ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
-	struct rq *rq = task_rq(p);
-
-	lockdep_assert_held(&rq->lock);
-
-	dequeue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_MIGRATING;
-	set_task_cpu(p, new_cpu);
-	raw_spin_unlock(&rq->lock);
-
-	rq = cpu_rq(new_cpu);
-
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	enqueue_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-
-	return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	unsigned long flags;
-	struct rq *rq;
-	unsigned int dest_cpu;
-	int ret = 0;
-
-	rq = task_rq_lock(p, &flags);
-
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
-		goto out;
-
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	do_set_cpus_allowed(p, new_mask);
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		tlb_migrate_finish(p->mm);
-		return 0;
-	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-out:
-	task_rq_unlock(rq, p, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	struct rq *rq;
-	int ret = 0;
-
-	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
-
-	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
-
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
-}
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
@@ -4957,35 +4979,9 @@ void sched_setnuma(struct task_struct *p
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-	struct migration_arg *arg = data;
-
-	/*
-	 * The original target cpu might have gone down and we might
-	 * be on another cpu but it doesn't matter.
-	 */
-	local_irq_disable();
-	/*
-	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
-	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
-	 */
-	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-	local_irq_enable();
-	return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5088,7 +5084,6 @@ static void migrate_tasks(unsigned int d
 
 	rq->stop = stop;
 }
-
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5267,7 +5262,7 @@ static void register_sched_domain_sysctl
 static void unregister_sched_domain_sysctl(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 
 static void set_rq_online(struct rq *rq)
 {
@@ -5414,9 +5409,6 @@ static int __init migration_init(void)
 	return 0;
 }
 early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
 
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
@@ -6642,7 +6634,7 @@ static int __sdt_alloc(const struct cpum
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
-		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 11/14] sched: Streamline the task migration locking a little
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (9 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 10/14] sched: Move code around Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 12/14] lockdep: Simplify lock_release() Peter Zijlstra
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-migrate-locking.patch --]
[-- Type: text/plain, Size: 4633 bytes --]

The whole migrate_task{,s}() locking seems a little shaky; there's a
lot of lock dropping and reacquiring happening. Pull the locking up into the
callers as far as possible to streamline the lot.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   76 +++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 42 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1065,10 +1065,8 @@ void check_preempt_curr(struct rq *rq, s
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
 {
-	struct rq *rq = task_rq(p);
-
 	lockdep_assert_held(&rq->lock);
 
 	dequeue_task(rq, p, 0);
@@ -1100,41 +1098,19 @@ struct migration_arg {
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
  */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
 {
-	struct rq *rq;
-	int ret = 0;
-
 	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
+		return rq;
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
+		return rq;
 
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
+	rq = move_queued_task(rq, p, dest_cpu);
+
+	return rq;
 }
 
 /*
@@ -1145,6 +1121,8 @@ static int __migrate_task(struct task_st
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
+	struct task_struct *p = arg->task;
+	struct rq *rq = this_rq();
 
 	/*
 	 * The original target cpu might have gone down and we might
@@ -1157,7 +1135,19 @@ static int migration_cpu_stop(void *data
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/*
+	 * If task_rq(p) != rq, it cannot be migrated here, because we're
+	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+	 * we're holding p->pi_lock.
+	 */
+	if (task_rq(p) == rq && task_on_rq_queued(p))
+		rq = __migrate_task(rq, p, arg->dest_cpu);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+
 	local_irq_enable();
 	return 0;
 }
@@ -1212,7 +1202,7 @@ int set_cpus_allowed_ptr(struct task_str
 		tlb_migrate_finish(p->mm);
 		return 0;
 	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
+		rq = move_queued_task(rq, p, dest_cpu);
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -5049,9 +5039,9 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
+	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 
@@ -5073,7 +5063,7 @@ static void migrate_tasks(unsigned int d
 	 */
 	update_rq_clock(rq);
 
-	for ( ; ; ) {
+	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
@@ -5086,12 +5076,14 @@ static void migrate_tasks(unsigned int d
 		next->sched_class->put_prev_task(rq, next);
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_cpu, next);
-		raw_spin_unlock(&rq->lock);
-
-		__migrate_task(next, dead_cpu, dest_cpu);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-		raw_spin_lock(&rq->lock);
+		rq = __migrate_task(rq, next, dest_cpu);
+		if (rq != dead_rq) {
+			raw_spin_unlock(&rq->lock);
+			rq = dead_rq;
+			raw_spin_lock(&rq->lock);
+		}
 	}
 
 	rq->stop = stop;
@@ -5343,7 +5335,7 @@ migration_call(struct notifier_block *nf
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
-		migrate_tasks(cpu);
+		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 12/14] lockdep: Simplify lock_release()
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (10 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 11/14] sched: Streamline the task migration locking a little Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 13/14] lockdep: Implement lock pinning Peter Zijlstra
  2015-06-05  8:48 ` [PATCH 14/14] sched,lockdep: Employ " Peter Zijlstra
  13 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-lockdep-simplify-release.patch --]
[-- Type: text/plain, Size: 4879 bytes --]

lock_release() takes this nested argument that's mostly pointless
these days; remove the implementation but leave the argument as a
rudiment for now.
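
For reference, callers keep passing the (now ignored) flag; e.g. the
scheduler's early lockdep release in context_switch(), visible in the
context of patch 14 of this series, still reads:

	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

The '1' is the nested argument that, after this patch, is simply ignored.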

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/locking/lockdep.c |  119 +++++++----------------------------------------
 1 file changed, 18 insertions(+), 101 deletions(-)

--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3260,26 +3260,6 @@ print_unlock_imbalance_bug(struct task_s
 	return 0;
 }
 
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
-			unsigned long ip)
-{
-	if (unlikely(!debug_locks))
-		return 0;
-	/*
-	 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
-		return 0;
-
-	if (curr->lockdep_depth <= 0)
-		return print_unlock_imbalance_bug(curr, lock, ip);
-
-	return 1;
-}
-
 static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 {
 	if (hlock->instance == lock)
@@ -3376,31 +3356,35 @@ __lock_set_class(struct lockdep_map *loc
 }
 
 /*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
  */
 static int
-lock_release_non_nested(struct task_struct *curr,
-			struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
+	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
 	unsigned int depth;
 	int i;
 
-	/*
-	 * Check whether the lock exists in the current stack
-	 * of held locks:
-	 */
+	if (unlikely(!debug_locks))
+		return 0;
+
 	depth = curr->lockdep_depth;
 	/*
 	 * So we're all set to release this lock.. wait what lock? We don't
 	 * own any locks, you've been drinking again?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth))
-		return 0;
+	if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+		 return print_unlock_imbalance_bug(curr, lock, ip);
 
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
 	prev_hlock = NULL;
 	for (i = depth-1; i >= 0; i--) {
 		hlock = curr->held_locks + i;
@@ -3456,78 +3440,10 @@ lock_release_non_nested(struct task_stru
 	 */
 	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
 		return 0;
-	return 1;
-}
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
-			       struct lockdep_map *lock, unsigned long ip)
-{
-	struct held_lock *hlock;
-	unsigned int depth;
-
-	/*
-	 * Pop off the top of the lock stack:
-	 */
-	depth = curr->lockdep_depth - 1;
-	hlock = curr->held_locks + depth;
-
-	/*
-	 * Is the unlock non-nested:
-	 */
-	if (hlock->instance != lock || hlock->references)
-		return lock_release_non_nested(curr, lock, ip);
-	curr->lockdep_depth--;
-
-	/*
-	 * No more locks, but somehow we've got hash left over, who left it?
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
-		return 0;
-
-	curr->curr_chain_key = hlock->prev_chain_key;
-
-	lock_release_holdtime(hlock);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-	hlock->prev_chain_key = 0;
-	hlock->class_idx = 0;
-	hlock->acquire_ip = 0;
-	hlock->irq_context = 0;
-#endif
 	return 1;
 }
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
-{
-	struct task_struct *curr = current;
-
-	if (!check_unlock(curr, lock, ip))
-		return;
-
-	if (nested) {
-		if (!lock_release_nested(curr, lock, ip))
-			return;
-	} else {
-		if (!lock_release_non_nested(curr, lock, ip))
-			return;
-	}
-
-	check_chain_key(curr);
-}
-
 static int __lock_is_held(struct lockdep_map *lock)
 {
 	struct task_struct *curr = current;
@@ -3639,7 +3555,8 @@ void lock_release(struct lockdep_map *lo
 	check_flags(flags);
 	current->lockdep_recursion = 1;
 	trace_lock_release(lock, ip);
-	__lock_release(lock, nested, ip);
+	if (__lock_release(lock, nested, ip))
+		check_chain_key(current);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 13/14] lockdep: Implement lock pinning
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (11 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 12/14] lockdep: Simplify lock_release() Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  9:55   ` Ingo Molnar
  2015-06-05  8:48 ` [PATCH 14/14] sched,lockdep: Employ " Peter Zijlstra
  13 siblings, 1 reply; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-lockdep-hold.patch --]
[-- Type: text/plain, Size: 4990 bytes --]

Add a lockdep annotation that WARNs if you 'accidentally' unlock a
lock.

This is especially helpful for code with callbacks, where the upper
layer assumes a lock remains taken, but a lower layer thinks it can
drop and reacquire the lock.

By unwittingly breaking up the lock, races can be introduced.

Lock pinning is a lockdep annotation that helps with this: when you
lockdep_pin_lock() a held lock, any unlock without a
lockdep_unpin_lock() will produce a WARN. Think of this as a relative
of lockdep_assert_held(), except you don't only assert it's held now,
but ensure it stays held until you release your assertion.
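
A minimal usage sketch (illustrative only; some_callback() is a made-up
name, not a call site from this series):

	raw_spin_lock(&rq->lock);
	lockdep_pin_lock(&rq->lock);

	/*
	 * Any raw_spin_unlock(&rq->lock) in here -- say, from a callback
	 * that thinks it can drop and retake the lock -- will now WARN,
	 * because the held lock is still pinned.
	 */
	some_callback(rq);

	lockdep_unpin_lock(&rq->lock);
	raw_spin_unlock(&rq->lock);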

RFC: a possible alternative API would be something like:

  int cookie = lockdep_pin_lock(&foo);
  ...
  lockdep_unpin_lock(&foo, cookie);

Where we pick a random number for the pin_count; this makes it
impossible to sneak a lock break in without also passing the right
cookie along.

I've not done this because it ends up generating code for !LOCKDEP,
esp. if you need to pass the cookie around for some reason.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/lockdep.h  |   10 +++++
 kernel/locking/lockdep.c |   80 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -255,6 +255,7 @@ struct held_lock {
 	unsigned int check:1;       /* see lock_acquire() comment */
 	unsigned int hardirqs_off:1;
 	unsigned int references:12;					/* 32 bits */
+	unsigned int pin_count;
 };
 
 /*
@@ -354,6 +355,9 @@ extern void lockdep_set_current_reclaim_
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
 
+extern void lock_pin_lock(struct lockdep_map *lock);
+extern void lock_unpin_lock(struct lockdep_map *lock);
+
 # define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
@@ -368,6 +372,9 @@ extern void lockdep_trace_alloc(gfp_t ma
 
 #define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)
 
+#define lockdep_pin_lock(l)		lock_pin_lock(&(l)->dep_map)
+#define lockdep_unpin_lock(l)	lock_unpin_lock(&(l)->dep_map)
+
 #else /* !CONFIG_LOCKDEP */
 
 static inline void lockdep_off(void)
@@ -420,6 +427,9 @@ struct lock_class_key { };
 
 #define lockdep_recursing(tsk)			(0)
 
+#define lockdep_pin_lock(l)				do { (void)(l); } while (0)
+#define lockdep_unpin_lock(l)			do { (void)(l); } while (0)
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_LOCK_STAT
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
+	hlock->pin_count = 0;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3403,6 +3404,8 @@ __lock_release(struct lockdep_map *lock,
 	if (hlock->instance == lock)
 		lock_release_holdtime(hlock);
 
+	WARN(hlock->pin_count, "releasing a stayed lock\n");
+
 	if (hlock->references) {
 		hlock->references--;
 		if (hlock->references) {
@@ -3459,6 +3462,49 @@ static int __lock_is_held(struct lockdep
 	return 0;
 }
 
+static void __lock_pin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			hlock->pin_count++;
+			return;
+		}
+	}
+
+	WARN(1, "staying unheld lock\n");
+}
+
+static void __lock_unpin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			if (WARN(!hlock->pin_count, "unstaying unstayed lock\n"))
+				return;
+
+			hlock->pin_count--;
+			return;
+		}
+	}
+
+	WARN(1, "unstaying unheld lock\n");
+}
+
 /*
  * Check whether we follow the irq-flags state precisely:
  */
@@ -3582,6 +3628,40 @@ int lock_is_held(struct lockdep_map *loc
 }
 EXPORT_SYMBOL_GPL(lock_is_held);
 
+void lock_pin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_pin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_unpin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
 	current->lockdep_reclaim_gfp = gfp_mask;



^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 14/14] sched,lockdep: Employ lock pinning
  2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
                   ` (12 preceding siblings ...)
  2015-06-05  8:48 ` [PATCH 13/14] lockdep: Implement lock pinning Peter Zijlstra
@ 2015-06-05  8:48 ` Peter Zijlstra
  2015-06-05  9:57   ` Ingo Molnar
  13 siblings, 1 reply; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05  8:48 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-stay-rq_lock.patch --]
[-- Type: text/plain, Size: 8209 bytes --]

Employ the new lockdep lock pinning annotation to ensure no
'accidental' lock-breaks happen with rq->lock.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   42 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/deadline.c |    8 ++++++++
 kernel/sched/fair.c     |   11 ++++++++---
 kernel/sched/rt.c       |    8 ++++++++
 kernel/sched/sched.h    |   10 ++++++++--
 5 files changed, 71 insertions(+), 8 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1201,8 +1201,15 @@ int set_cpus_allowed_ptr(struct task_str
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
-	} else if (task_on_rq_queued(p))
+	} else if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		rq = move_queued_task(rq, p, dest_cpu);
+		lockdep_pin_lock(&rq->lock);
+	}
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -1562,6 +1569,8 @@ static int select_fallback_rq(int cpu, s
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
@@ -1652,9 +1661,12 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
-		 * XXX can drop rq->lock; most likely ok.
+		 * Our task @p is fully woken up and running; so its safe to
+		 * drop the rq->lock, hereafter rq is only used for statistics.
 		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (rq->idle_stamp) {
@@ -1674,6 +1686,8 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
+	lockdep_assert_held(&rq->lock);
+
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
@@ -1718,6 +1732,7 @@ void sched_ttwu_pending(void)
 		return;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
+	lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1725,6 +1740,7 @@ void sched_ttwu_pending(void)
 		ttwu_do_activate(rq, p, 0);
 	}
 
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1821,7 +1837,9 @@ static void ttwu_queue(struct task_struc
 #endif
 
 	raw_spin_lock(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1916,9 +1934,17 @@ static void try_to_wake_up_local(struct
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet picked a replacement task.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (!(p->state & TASK_NORMAL))
@@ -2538,6 +2564,7 @@ context_switch(struct rq *rq, struct tas
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
+	lockdep_unpin_lock(&rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
 	/* Here we just switch the register state and the stack. */
@@ -2960,6 +2987,7 @@ static void __sched __schedule(void)
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -3002,8 +3030,10 @@ static void __sched __schedule(void)
 
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
-	} else
+	} else {
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock_irq(&rq->lock);
+	}
 
 	balance_callback(rq);
 }
@@ -5071,6 +5101,11 @@ static void migrate_tasks(struct rq *dea
 		if (rq->nr_running == 1)
 			break;
 
+		/*
+		 * Ensure rq->lock covers the entire task selection
+		 * until the migration.
+		 */
+		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
@@ -5078,6 +5113,7 @@ static void migrate_tasks(struct rq *dea
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
+		lockdep_unpin_lock(&rq->lock);
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1153,7 +1153,15 @@ struct task_struct *pick_next_task_dl(st
 	dl_rq = &rq->dl;
 
 	if (need_pull_dl_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a stop task can slip in, in which case we need to
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,7 +5392,15 @@ pick_next_task_fair(struct rq *rq, struc
 	return p;
 
 idle:
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	lockdep_unpin_lock(&rq->lock);
 	new_tasks = idle_balance(rq);
+	lockdep_pin_lock(&rq->lock);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
@@ -7426,9 +7434,6 @@ static int idle_balance(struct rq *this_
 		goto out;
 	}
 
-	/*
-	 * Drop the rq->lock, but keep IRQ/preempt disabled.
-	 */
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1478,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct
 	struct rt_rq *rt_rq = &rq->rt;
 
 	if (need_pull_rt_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_rt_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a dl or stop task can slip in, in which case we need
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1438,8 +1438,10 @@ static inline struct rq *__task_rq_lock(
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 
 		while (unlikely(task_on_rq_migrating(p)))
@@ -1476,8 +1478,10 @@ static inline struct rq *task_rq_lock(st
 		 * If we observe the new cpu in task_rq_lock, the acquire will
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 
@@ -1489,6 +1493,7 @@ static inline struct rq *task_rq_lock(st
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1497,6 +1502,7 @@ task_rq_unlock(struct rq *rq, struct tas
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  8:48 ` [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
@ 2015-06-05  9:48   ` Thomas Gleixner
  2015-06-07 19:43   ` Oleg Nesterov
  2015-06-07 22:33   ` Oleg Nesterov
  2 siblings, 0 replies; 55+ messages in thread
From: Thomas Gleixner @ 2015-06-05  9:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, wanpeng.li, linux-kernel

On Fri, 5 Jun 2015, Peter Zijlstra wrote:
>  /*
> + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
> + * such that hrtimer_callback_running() can unconditionally dereference
> + * timer->base->cpu_base
> + */
> +static struct hrtimer_cpu_base migration_cpu_base = {
> +	.seq = SEQCNT_ZERO(migration_cpu_base),
> +};
> +
> +static struct hrtimer_clock_base migration_base = {
> +	.cpu_base = &migration_cpu_base,
> +};

You can spare that extra migration clock base, because
migration_cpu_base already has clock bases inside.

static struct hrtimer_cpu_base migration_cpu_base = {
	.seq = SEQCNT_ZERO(migration_cpu_base),
};

and do:

    migration_cpu_base.clock_base[CLOCK_MONOTONIC].cpu_base = &migration_cpu_base;

in hrtimer_init.

So migration base becomes: 

#define migration_base	   &migration_cpu_base.clock_base[CLOCK_MONOTONIC]

That's also more efficient in terms of cache because it is two adjacent
cache lines instead of two distant ones.

Other than that, this looks good.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 13/14] lockdep: Implement lock pinning
  2015-06-05  8:48 ` [PATCH 13/14] lockdep: Implement lock pinning Peter Zijlstra
@ 2015-06-05  9:55   ` Ingo Molnar
  2015-06-11 11:37     ` Peter Zijlstra
  0 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2015-06-05  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> RFC: a possible alternative API would be something like:
> 
>   int cookie = lockdep_pin_lock(&foo);
>   ...
>   lockdep_unpin_lock(&foo, cookie);

Yeah, this would be even nicer.

> Where we pick a random number for the pin_count; this makes it
> impossible to sneak a lock break in without also passing the right
> cookie along.
> 
> I've not done this because it ends up generating code for !LOCKDEP,
> esp. if you need to pass the cookie around for some reason.

The cookie could be a zero-size structure, which can be 'passed around' 
syntactically but creates no overhead in the code.
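
A rough sketch of that idea (hypothetical type/field names, relying on
GCC's zero-size empty struct extension):

#ifdef CONFIG_LOCKDEP
struct pin_cookie { unsigned int val; };	/* carries the random pin value */
#else
struct pin_cookie { };				/* zero bytes; free to pass around */
#endif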

But I'd expect cookie-passing to be a sign of badness in most cases: the lock 
should generally be unpinned at the same level of abstraction...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 14/14] sched,lockdep: Employ lock pinning
  2015-06-05  8:48 ` [PATCH 14/14] sched,lockdep: Employ " Peter Zijlstra
@ 2015-06-05  9:57   ` Ingo Molnar
  2015-06-05 11:03     ` Peter Zijlstra
  0 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2015-06-05  9:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> Employ the new lockdep lock pinning annotation to ensure no
> 'accidental' lock-breaks happen with rq->lock.

btw., could we perhaps reorder this series a bit and first see that it properly 
detects the original locking bug, and then only fix the bug?

To make sure the pinning infrastructure works and all that.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 14/14] sched,lockdep: Employ lock pinning
  2015-06-05  9:57   ` Ingo Molnar
@ 2015-06-05 11:03     ` Peter Zijlstra
  2015-06-05 11:24       ` Ingo Molnar
  0 siblings, 1 reply; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-05 11:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Fri, 2015-06-05 at 11:57 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > Employ the new lockdep lock pinning annotation to ensure no
> > 'accidental' lock-breaks happen with rq->lock.
> 
> btw., could we perhaps reorder this series a bit and first see that it properly 
> detects the original locking bug, and then only fix the bug?
> 
> To make sure the pinning infrastructure works and all that.

It works; I've had plenty of explosions while trying to get some of the
less obvious code paths sorted. I've managed to hit every single WARN I
added ;-)

Also, for bisection it might be annoying to hit known bad points, which
is why it's ordered this way.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 14/14] sched,lockdep: Employ lock pinning
  2015-06-05 11:03     ` Peter Zijlstra
@ 2015-06-05 11:24       ` Ingo Molnar
  0 siblings, 0 replies; 55+ messages in thread
From: Ingo Molnar @ 2015-06-05 11:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2015-06-05 at 11:57 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > Employ the new lockdep lock pinning annotation to ensure no
> > > 'accidental' lock-breaks happen with rq->lock.
> > 
> > btw., could we perhaps reorder this series a bit and first see that it 
> > properly detects the original locking bug, and then only fix the bug?
> > 
> > To make sure the pinning infrastructure works and all that.
> 
> It works; I've had plenty of explosions while trying to get some of the less 
> obvious code paths sorted. I've managed to hit every single WARN I added ;-)
> 
> Also, for bisection it might be annoying to hit known bad points, which is why 
> it's ordered this way.

ok! :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  8:48 ` [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
  2015-06-05  9:48   ` Thomas Gleixner
@ 2015-06-07 19:43   ` Oleg Nesterov
  2015-06-07 22:33   ` Oleg Nesterov
  2 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-07 19:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/05, Peter Zijlstra wrote:
>
>  #define HRTIMER_STATE_INACTIVE	0x00
>  #define HRTIMER_STATE_ENQUEUED	0x01
> -#define HRTIMER_STATE_CALLBACK	0x02
> -#define HRTIMER_STATE_MIGRATE	0x04
> +#define HRTIMER_STATE_MIGRATE	0x02

Slightly off-topic, but I do not understand HRTIMER_STATE_MIGRATE
with or without this change. Unless I am totally confused it looks
buggy and simply unneeded.

migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but
this is not enough: this can fool, say, hrtimer_is_queued() in
dequeue_signal().

Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED?
This fixes the race and we can kill STATE_MIGRATE.

Oleg.


--- x/include/linux/hrtimer.h
+++ x/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
--- x/kernel/time/hrtimer.c
+++ x/kernel/time/hrtimer.c
@@ -1642,11 +1642,11 @@ static void migrate_hrtimer_list(struct 
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1657,9 +1657,6 @@ static void migrate_hrtimer_list(struct 
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-05  8:48 ` [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
  2015-06-05  9:48   ` Thomas Gleixner
  2015-06-07 19:43   ` Oleg Nesterov
@ 2015-06-07 22:33   ` Oleg Nesterov
  2015-06-07 22:56     ` Oleg Nesterov
  2015-06-08  9:14     ` Peter Zijlstra
  2 siblings, 2 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-07 22:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

Not sure I read this patch correctly, it doesn't apply to Linus's tree.

And I simply can not understand the complication in hrtimer_active(),
please help!

On 06/05, Peter Zijlstra wrote:
>
> +bool hrtimer_active(const struct hrtimer *timer)
> +{
> +	struct hrtimer_cpu_base *cpu_base;
> +	unsigned int seq;
> +	bool active;
> +
> +	do {
> +		active = false;
> +		cpu_base = READ_ONCE(timer->base->cpu_base);
> +		seq = raw_read_seqcount(&cpu_base->seq);
> +
> +		if (timer->state != HRTIMER_STATE_INACTIVE ||
> +		    cpu_base->running == timer)
> +			active = true;

Why we can't simply return true in this case?

Unless you lock this timer, hrtimer_active() is inherently racy anyway.
Granted, it must not wrongly return False if the timer is pending or
running.

But "false positive" does not differ from the case when (say) the
running timer->function() finishes right after hrtimer_active() returns
True.

> +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> +		 cpu_base != READ_ONCE(timer->base->cpu_base));

Why do we need to re-check ->cpu_base?

I think we can ignore migrate_hrtimer_list(), it doesn't clear ->state.

Otherwise the timer can change its ->base only if it is not running and
inactive, and again I think we should only eliminate the false negative
return.

And I think there is a problem. Consider a timer TIMER which always
rearms itself using some "default" timeout.

In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
hrtimer_active(&TIMER) == T. By definition, and currently this is
true.

After this patch this is no longer true (afaics). If the timer is
pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
will clear ENQUEUED first, then set it again in enqueue_hrtimer().

This means that hrtimer_active() returns false in between. And note
that it doesn't matter if the timer changes its ->base or not, so
that 2nd cpu_base above can't help.

I think that __hrtimer_start_range_ns() should preserve ENQUEUED
like migrate_hrtimer_list() should do (see the previous email).


Finally. Suppose that timer->function() returns HRTIMER_RESTART
and hrtimer_active() is called right after __run_hrtimer() sets
cpu_base->running = NULL. I can't understand why hrtimer_active()
can't miss ENQUEUED in this case. We have wmb() in between, yes,
but then hrtimer_active() should do something like

	active = cpu_base->running == timer;
	if (!active) {
		rmb();
		active = state != HRTIMER_STATE_INACTIVE;
	}

No?

But I am already sleeping and probably totally confused.

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-07 22:33   ` Oleg Nesterov
@ 2015-06-07 22:56     ` Oleg Nesterov
  2015-06-08  8:06       ` Thomas Gleixner
  2015-06-08  9:14     ` Peter Zijlstra
  1 sibling, 1 reply; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-07 22:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Oleg Nesterov wrote:
>
> And I simply can not understand the complication in hrtimer_active(),
> please help!

Sorry for another off-topic email, but I don't even understand the
usage of hrtimer_active().

Say, do_nanosleep()

		hrtimer_start_expires(&t->timer, mode);
		if (!hrtimer_active(&t->timer))
			t->task = NULL;

why? Assuming that hrtimer_active() is correct, it can only return
false if t->task was already cleared by hrtimer_wakeup().


OTOH. perf_cpu_hrtimer_restart() does

	if (hrtimer_active(hr))
		return;

	if (!hrtimer_callback_running(hr))
		__hrtimer_start_range_ns(...);

why it can't simply do

	if (!hrtimer_active(hr)) // implies !hrtimer_callback_running()
		__hrtimer_start_range_ns(...);


Confused.

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-07 22:56     ` Oleg Nesterov
@ 2015-06-08  8:06       ` Thomas Gleixner
  0 siblings, 0 replies; 55+ messages in thread
From: Thomas Gleixner @ 2015-06-08  8:06 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

On Mon, 8 Jun 2015, Oleg Nesterov wrote:

> On 06/08, Oleg Nesterov wrote:
> >
> > And I simply can not understand the complication in hrtimer_active(),
> > please help!
> 
> Sorry for another off-topic email, but I don't even understand the
> usage of hrtimer_active().
> 
> Say, do_nanosleep()
> 
> 		hrtimer_start_expires(&t->timer, mode);
> 		if (!hrtimer_active(&t->timer))
> 			t->task = NULL;

That's gone in tip/timers/core. 

> OTOH. perf_cpu_hrtimer_restart() does
> 
> 	if (hrtimer_active(hr))
> 		return;
> 
> 	if (!hrtimer_callback_running(hr))
> 		__hrtimer_start_range_ns(...);
> 
> why it can't simply do
> 
> 	if (!hrtimer_active(hr)) // implies !hrtimer_callback_running()
> 		__hrtimer_start_range_ns(...);

That's fixed as well.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-07 22:33   ` Oleg Nesterov
  2015-06-07 22:56     ` Oleg Nesterov
@ 2015-06-08  9:14     ` Peter Zijlstra
  2015-06-08 10:55       ` Peter Zijlstra
                         ` (3 more replies)
  1 sibling, 4 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08  9:14 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
> Not sure I read this patch correctly, it doesn't apply to Linus's tree.

I was working on tip/master, there's a number of timer patches in there.

> And I simply can not understand the complication in hrtimer_active(),
> please help!
> 
> On 06/05, Peter Zijlstra wrote:
> >
> > +bool hrtimer_active(const struct hrtimer *timer)
> > +{
> > +	struct hrtimer_cpu_base *cpu_base;
> > +	unsigned int seq;
> > +	bool active;
> > +
> > +	do {
> > +		active = false;
> > +		cpu_base = READ_ONCE(timer->base->cpu_base);
> > +		seq = raw_read_seqcount(&cpu_base->seq);
> > +
> > +		if (timer->state != HRTIMER_STATE_INACTIVE ||
> > +		    cpu_base->running == timer)
> > +			active = true;
> 
> Why we can't simply return true in this case?
> 
> Unless you lock this timer, hrtimer_active() is inherently racy anyway.
> Granted, it must not wrongly return False if the timer is pending or
> running.
> 
> But "false positive" does not differ from the case when (say) the
> running timer->function() finishes right after hrtimer_active() returns
> True.

So the problem with:

static inline bool hrtimer_active(const struct timer *timer)
{
	return timer->state != HRTIMER_STATE_INACTIVE ||
	       timer->base->cpu_base->running == timer;
}

Is that it can indeed return false falsely.

Consider __run_hrtimer:

	cpu_base->running = timer;
	__remove_hrtimer(..., HRTIMER_STATE_INACTIVE, ...);

Our test could observe INACTIVE but not yet see the ->running store. In
this case it would return false, even though it is in fact active.

> > +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > +		 cpu_base != READ_ONCE(timer->base->cpu_base));
> 
> Why do we need to re-check ->cpu_base?

Because each CPU's cpu_base has independent seq numbers, so if the timer
migrates, the seq numbers might align just so that we fail to detect the
change, even though there was one -- extremely unlikely, but possible.

> I think we can ignore migrate_hrtimer_list(), it doesn't clear ->state.

I'm inclined to agree, although I did not get around to staring at that
on Friday and am currently in the process of waking up.

> Otherwise the timer can change its ->base only if it is not running and
> inactive, and again I think we should only eliminate the false negative
> return.

Agreed.

> And I think there is a problem. Consider a timer TIMER which always
> rearms itself using some "default" timeout.
> 
> In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
> hrtimer_active(&TIMER) == T. By definition, and currently this is
> true.

I must ask for a little clarification; is this the simple:

	->function()
		hrtimer_forward_now(&self, timeout);
		return HRTIMER_RESTART;

Or something that 'randomly' calls:

	hrtimer_start() on a timer?

Because for the latter this isn't currently true for the same reason as
you give here:

> After this patch this is no longer true (afaics). If the timer is
> pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
> will clear ENQUEUED first, then set it again in enqueue_hrtimer().

That is so even with the current code; the current code uses:

	hrtimer->state & CALLBACK

for __remove_hrtimer(.state). In the above case of a pending timer,
that's 0 aka. HRTIMER_STATE_INACTIVE.

> This means that hrtimer_active() returns false in between. And note
> that it doesn't matter if the timer changes its ->base or not, so
> that 2nd cpu_base above can't help.
> 
> I think that __hrtimer_start_range_ns() should preserve ENQUEUED
> like migrate_hrtimer_list() should do (see the previous email).

I tend to agree, but I think it's a pre-existing problem, not one
introduced by my proposed patch.

> Finally. Suppose that timer->function() returns HRTIMER_RESTART
> and hrtimer_active() is called right after __run_hrtimer() sets
> cpu_base->running = NULL. I can't understand why hrtimer_active()
> can't miss ENQUEUED in this case. We have wmb() in between, yes,
> but then hrtimer_active() should do something like
> 
> 	active = cpu_base->running == timer;
> 	if (!active) {
> 		rmb();
> 		active = state != HRTIMER_STATE_INACTIVE;
> 	}
> 
> No?

Hmm, good point. Let me think about that. It would be nice to be able to
avoid more memory barriers.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08  9:14     ` Peter Zijlstra
@ 2015-06-08 10:55       ` Peter Zijlstra
  2015-06-08 12:42       ` Peter Zijlstra
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 10:55 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
> > Not sure I read this patch correctly, it doesn't apply to Linus's tree.
> 
> I was working on tip/master, there's a number of timer patches in there.
> 
> > And I simply can not understand the complication in hrtimer_active(),
> > please help!
> > 
> > On 06/05, Peter Zijlstra wrote:
> > >
> > > +bool hrtimer_active(const struct hrtimer *timer)
> > > +{
> > > +	struct hrtimer_cpu_base *cpu_base;
> > > +	unsigned int seq;
> > > +	bool active;
> > > +
> > > +	do {
> > > +		active = false;
> > > +		cpu_base = READ_ONCE(timer->base->cpu_base);
> > > +		seq = raw_read_seqcount(&cpu_base->seq);
> > > +
> > > +		if (timer->state != HRTIMER_STATE_INACTIVE ||
> > > +		    cpu_base->running == timer)
> > > +			active = true;
> > 
> > Why we can't simply return true in this case?
> > 
> > Unless you lock this timer, hrtimer_active() is inherently racy anyway.
> > Granted, it must not wrongly return False if the timer is pending or
> > running.
> > 
> > But "false positive" does not differ from the case when (say) the
> > running timer->function() finishes right after hrtimer_active() returns
> > True.

OK, I can't read; you asked why I delay the return true inside that loop.

Yes we can as per your argument. I think I ended up being too paranoid
or something.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08  9:14     ` Peter Zijlstra
  2015-06-08 10:55       ` Peter Zijlstra
@ 2015-06-08 12:42       ` Peter Zijlstra
  2015-06-08 14:27         ` Oleg Nesterov
  2015-06-09 21:33         ` Oleg Nesterov
  2015-06-08 14:03       ` Oleg Nesterov
  2015-06-08 14:17       ` Peter Zijlstra
  3 siblings, 2 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 12:42 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > and hrtimer_active() is called right after __run_hrtimer() sets
> > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > but then hrtimer_active() should do something like
> > 
> > 	active = cpu_base->running == timer;
> > 	if (!active) {
> > 		rmb();
> > 		active = state != HRTIMER_STATE_INACTIVE;
> > 	}
> > 
> > No?
> 
> Hmm, good point. Let me think about that. It would be nice to be able to
> avoid more memory barriers.

So your scenario is:

				[R] seq
				  RMB
[S] ->state = ACTIVE
  WMB
[S] ->running = NULL
				[R] ->running (== NULL)
				[R] ->state (== INACTIVE; fail to observe
				             the ->state store due to
					     lack of order)
				  RMB
				[R] seq (== seq)
[S] seq++

Conversely, if we re-order the (first) seq++ store such that it comes
first:

[S] seq++

				[R] seq
				  RMB
				[R] ->running (== NULL)
[S] ->running = timer;
  WMB
[S] ->state = INACTIVE
				[R] ->state (== INACTIVE)
				  RMB
				[R] seq (== seq)

And we have another false negative.

And in this case we need the reads ordered the other way around; we'd need:

	active = timer->state != HRTIMER_STATE_INACTIVE;
	if (!active) {
		smp_rmb();
		active = cpu_base->running == timer;
	}

Now I think we can fix this by either doing:

	WMB
	seq++
	WMB

On both sides of __run_hrtimer(), or do

bool hrtimer_active(const struct hrtimer *timer)
{
	struct hrtimer_cpu_base *cpu_base;
	unsigned int seq;

	do {
		cpu_base = READ_ONCE(timer->base->cpu_base);
		seq = raw_read_seqcount(&cpu_base->seq);

		if (timer->state != HRTIMER_STATE_INACTIVE)
			return true;

		smp_rmb();

		if (cpu_base->running == timer)
			return true;

		smp_rmb();

		if (timer->state != HRTIMER_STATE_INACTIVE)
			return true;

	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
		 cpu_base != READ_ONCE(timer->base->cpu_base));

	return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);


And since __run_hrtimer() is the more performance-critical code, I think
it would be best to reduce the number of memory barriers there.
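
For concreteness, the first option would look something like this inside
__run_hrtimer() -- a rough, untested sketch, open-coding the sequence bump
instead of using the seqcount API:

	cpu_base->running = timer;

	smp_wmb();
	cpu_base->seq.sequence++;	/* open-coded bump; illustration only */
	smp_wmb();

	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);

	/* ... fn(timer) runs here and may set ENQUEUED again ... */

	smp_wmb();
	cpu_base->seq.sequence++;
	smp_wmb();

	cpu_base->running = NULL;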


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08  9:14     ` Peter Zijlstra
  2015-06-08 10:55       ` Peter Zijlstra
  2015-06-08 12:42       ` Peter Zijlstra
@ 2015-06-08 14:03       ` Oleg Nesterov
  2015-06-08 14:17       ` Peter Zijlstra
  3 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 14:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
>
> static inline bool hrtimer_active(const struct timer *timer)
> {
> 	return timer->state != HRTIMER_STATE_INACTIVE ||
> 	       timer->base->cpu_base->running == timer;
> }
>
> Is that is can indeed return false falsely.

Yes sure, this is obviously buggy. But again, only because returning
"false falsely" (a false negative) is wrong. OK, I see other emails from
you, let's discuss this problem there.

> > > +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > > +		 cpu_base != READ_ONCE(timer->base->cpu_base));
> >
> > Why do we need to re-check >cpu_base?
>
> Because each CPU's cpu_base has independent seq numbers, so if the timer
> migrates, the seq numbers might align just so that we fail to detect
> change, even though there was -- extremely unlikely, but possible.

Of course. However, I am not sure, but it seems to me that a) seq numbers
can't really help exactly because we can read the wrong base, and b) we can
rely on the QUEUED check. I'll send another email, but see also below.

> > And I think there is a problem. Consider a timer TIMER which always
> > rearms itself using some "default" timeout.
> >
> > In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
> > hrtimer_active(&TIMER) == T. By definition, and currently this is
> > true.
>
> I must ask for a little clarification; is this the simple:
>
> 	->function()
> 		hrtimer_forward_now(&self, timeout);
> 		return HRTIMER_RESTART;
>
> Or something that 'randomly' calls:
>
> 	hrtimer_start() on a timer?

The latter,

> Because for the latter this isn't currently true for the same reason as
> you give here:
>
> > After this patch this is no longer true (afaics). If the timer is
> > pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
> > will clear ENQUEUED first, then set it again in enqueue_hrtimer().
>
> That is so even with the current code; the current code uses:
>
> 	hrtimer->state & CALLBACK

Ah, indeed, I misread this code. So the code is already buggy.

> > I think that __hrtimer_start_range_ns() should preserve ENQUEUED
> > like migrate_hrtimer_list() should do (see the previous email).
>
> I tend to agree, but I think its a pre-existing problem, not one
> introduced by my proposed patch.

Yes. If we know that the timer always rearms itself then hrtimer_active()
must always be true. A "random" hrtimer_start() on this timer must not
create an "inactive" window.

So we should probably fix this in any case. And migrate_hrtimer_list()
too. And (it seems) this can help to simplify hrtimer_active().

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08  9:14     ` Peter Zijlstra
                         ` (2 preceding siblings ...)
  2015-06-08 14:03       ` Oleg Nesterov
@ 2015-06-08 14:17       ` Peter Zijlstra
  2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
  2015-06-08 15:10         ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
  3 siblings, 2 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 14:17 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
> > And I think there is a problem. Consider a timer TIMER which always
> > rearms itself using some "default" timeout.
> > 
> > In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
> > hrtimer_active(&TIMER) == T. By definition, and currently this is
> > true.
> >
> > After this patch this is no longer true (afaics). If the timer is
> > pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
> > will clear ENQUEUED first, then set it again in enqueue_hrtimer().
> 
> That is so even with the current code; the current code uses:
> 
> 	hrtimer->state & CALLBACK
> 
> for __remove_hrtimer(.state). In the above case of a pending timer,
> that's 0 aka. HRTIMER_STATE_INACTIVE.
> 
> > This means that hrtimer_active() returns false in between. And note
> > that it doesn't matter if the timer changes its ->base or not, so
> > that 2nd cpu_base above can't help.
> > 
> > I think that __hrtimer_start_range_ns() should preserve ENQUEUED
> > like migrate_hrtimer_list() should do (see the previous email).
> 
> I tend to agree, but I think its a pre-existing problem, not one
> introduced by my proposed patch.

Something like this would fix that I think. It fully preserves
timer->state over hrtimer_start_range_ns().

---
 kernel/time/hrtimer.c |   23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -891,10 +891,10 @@ static void __remove_hrtimer(struct hrti
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
+		unsigned long state = timer->state;
 		int reprogram;
 
 		/*
@@ -908,12 +908,15 @@ remove_hrtimer(struct hrtimer *timer, st
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
+
+		if (!restart) {
+			/*
+			 * We must preserve the CALLBACK state flag here,
+			 * otherwise we could move the timer base in
+			 * switch_hrtimer_base.
+			 */
+			state &= HRTIMER_STATE_CALLBACK;
+		}
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -938,7 +941,7 @@ void hrtimer_start_range_ns(struct hrtim
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base, true);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1007,7 +1010,7 @@ int hrtimer_try_to_cancel(struct hrtimer
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base);
+		ret = remove_hrtimer(timer, base, false);
 
 	unlock_hrtimer_base(timer, &flags);
 

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08 12:42       ` Peter Zijlstra
@ 2015-06-08 14:27         ` Oleg Nesterov
  2015-06-08 14:42           ` Peter Zijlstra
  2015-06-08 15:10           ` Peter Zijlstra
  2015-06-09 21:33         ` Oleg Nesterov
  1 sibling, 2 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 14:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > but then hrtimer_active() should do something like
> > >
> > > 	active = cpu_base->running == timer;
> > > 	if (!active) {
> > > 		rmb();
> > > 		active = state != HRTIMER_STATE_INACTIVE;
> > > 	}
> > >
> > > No?
> >
> > Hmm, good point. Let me think about that. It would be nice to be able to
> > avoid more memory barriers.

Yes, but otoh, can't we avoid seqcount_t altogether?

To remind, we assume that

	- "false positive" is fine. If we observe ENQUEUED or ->running
	  we can safely return true. It doesn't matter if the timer becomes
	  "inactive" right after return.

	- we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
	  to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
	  and hrtimer_active() we currently have.

Now, can't we simply do

	__run_hrtimer()
	{

		cpu_base->running = timer;

		wmb();				// 1

		__remove_hrtimer(INACTIVE);	// clears ENQUEUED

		fn();				// autorearm can set ENQUEUED again

		wmb();				// 2

		cpu_base->running = NULL;	// XXX
	}

	hrtimer_active(timer)
	{
		if (timer->state & ENQUEUED)
			return true;

		rmb();				// pairs with 1


		// We do not care if we race with __hrtimer_start_range_ns().
		// The running timer can't change its base.
		// If it was ENQUEUED, we rely on the previous check.

		base = timer->base->cpu_base;
		read_barrier_depends();
		if (base->running == timer)
			return true;

		rmb();				// pairs with 2

		// Avoid the race with auto-rearming timer. If we see the
		// result of XXX above we should also see ENQUEUED if it
		// was set by __run_hrtimer() or timer->function().
		//
		// We do not care if another thread does hrtimer_start()
		// and we miss ENQUEUED. In this case we can hit the "inactive"
		// window anyway, we can pretend that hrtimer_start() was
		// called after XXX above. So we can equally pretend that
		// hrtimer_active() was called in this window.
		//
		if (timer->state & ENQUEUED)
			return true;

		return false;
	}

Most probably I missed something... I'll try to think more, but perhaps
you see a hole immediately?

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08 14:27         ` Oleg Nesterov
@ 2015-06-08 14:42           ` Peter Zijlstra
  2015-06-08 15:49             ` Oleg Nesterov
  2015-06-08 15:10           ` Peter Zijlstra
  1 sibling, 1 reply; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 14:42 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> On 06/08, Peter Zijlstra wrote:
> >
> > On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > > but then hrtimer_active() should do something like
> > > >
> > > > 	active = cpu_base->running == timer;
> > > > 	if (!active) {
> > > > 		rmb();
> > > > 		active = state != HRTIMER_STATE_INACTIVE;
> > > > 	}
> > > >
> > > > No?
> > >
> > > Hmm, good point. Let me think about that. It would be nice to be able to
> > > avoid more memory barriers.
> 
> Yes, but otoh, can't we avoid seqcount_t altogether?
> 
> To remind, we assume that
> 
> 	- "false positive" is fine. If we observe ENQUEUED or ->running
> 	  we can safely return true. It doesn't matter if the timer becomes
> 	  "inactive" right after return.
> 
> 	- we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
> 	  to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
> 	  and hrtimer_active() we currently have.
> 
> Now, can't we simply do
> 
> 	__run_hrtimer()
> 	{
> 
> 		cpu_base->running = timer;
> 
> 		wmb();				// 1
> 
> 		__remove_hrtimer(INACTIVE);	// clears ENQUEUED
> 
> 		fn();				// autorearm can set ENQUEUED again
> 
> 		wmb();				// 2
> 
> 		cpu_base->running = NULL;	// XXX
> 	}
> 
> 	hrtimer_active(timer)
> 	{
> 		if (timer->state & ENQUEUED)
> 			return true;
> 
> 		rmb();				// pairs with 1
> 
> 
> 		// We do not care if we race with __hrtimer_start_range_ns().
> 		// The running timer can't change its base.
> 		// If it was ENQUEUED, we rely on the previous check.
> 
> 		base = timer->base->cpu_base;
> 		read_barrier_depends();
> 		if (base->running == timer)
> 			return true;
> 
> 		rmb();				// pairs with 2
> 
> 		// Avoid the race with auto-rearming timer. If we see the
> 		// result of XXX above we should also see ENQUEUED if it
> 		// was set by __run_hrtimer() or timer->function().
> 		//
> 		// We do not care if another thread does hrtimer_start()
> 		// and we miss ENQUEUED. In this case we can the "inactive"
> 		// window anyway, we can pretend that hrtimer_start() was
> 		// called after XXX above. So we can equally pretend that
> 		// hrtimer_active() was called in this window.
> 		//
> 		if (timer->state & ENQUEUED)
> 			return true;
> 
> 		return false;
> 	}
> 
> Most probably I missed something... I'll try to think more, but perhaps
> you see a hole immediately?

This is something I proposed earlier; Kirill said:

  lkml.kernel.org/r/2134411433408823@web8j.yandex.ru

Which I read like the below; imagine our timer expires periodically and
rearms itself:

 acquire
 cpu_base->running = timer;
 wmb
 timer->state = INACTIVE;
 release
				[R] timer->state (== INACTIVE)
 fn()
 acquire
 timer->state = ACTIVE
 wmb
 cpu_base->running = NULL
 release

				[R] cpu_base->running (== NULL)

 acquire
 cpu_base->running = timer;
 wmb
 timer->state = INACTIVE;
 release

				[R] timer->state (== INACTIVE)
 fn()
 acquire
 timer->state = ACTIVE
 wmb
 cpu_base->running = NULL
 release


And we have a false negative.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08 14:27         ` Oleg Nesterov
  2015-06-08 14:42           ` Peter Zijlstra
@ 2015-06-08 15:10           ` Peter Zijlstra
  2015-06-08 15:16             ` Oleg Nesterov
  1 sibling, 1 reply; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 15:10 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> 	- we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
> 	  to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
> 	  and hrtimer_active() we currently have.

So I have your patch and the one I just sent out; together they should
do this.

Can I add your SoB to your patch?

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes
  2015-06-08 14:17       ` Peter Zijlstra
@ 2015-06-08 15:10         ` Oleg Nesterov
  2015-06-08 15:11           ` [PATCH 2/3] hrtimer: turn newstate arg of __remove_hrtimer() into clear_enqueued Oleg Nesterov
                             ` (3 more replies)
  2015-06-08 15:10         ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
  1 sibling, 4 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> > I tend to agree, but I think its a pre-existing problem, not one
> > introduced by my proposed patch.
>
> Something like this would fix that I think. It fully preserves
> timer->state over hrtimer_start_range_ns().

Yes, but I think we can do a bit better.

Only for initial review, I need to re-check this...

And. I think that after you remove STATE_CALLBACK we can even kill
timer->state altogether, but this is another story.

What do you think?

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued()
  2015-06-08 14:17       ` Peter Zijlstra
  2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
@ 2015-06-08 15:10         ` Oleg Nesterov
  2015-06-08 15:13           ` Oleg Nesterov
  1 sibling, 1 reply; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

migrate_hrtimer_list() sets HRTIMER_STATE_MIGRATE to avoid the
race with hrtimer_active(), but this can't avoid the same race
with hrtimer_is_queued().

Kill HRTIMER_STATE_MIGRATE, we can simply use STATE_ENQUEUED.
---
 include/linux/hrtimer.h |    6 +-----
 kernel/time/hrtimer.c   |    7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1..44810bc 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd9..005fd44 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1640,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1655,9 +1655,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 2/3] hrtimer: turn newstate arg of __remove_hrtimer() into clear_enqueued
  2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
@ 2015-06-08 15:11           ` Oleg Nesterov
  2015-06-08 15:11           ` [PATCH 3/3] hrtimer: fix the __hrtimer_start_range_ns() race with hrtimer_active() Oleg Nesterov
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

No functional changes. Change __remove_hrtimer() to accept a boolean
and change only the HRTIMER_STATE_ENQUEUED bit. This preserves
HRTIMER_STATE_CALLBACK (which we are going to kill) automatically; the
only complication is that __run_hrtimer() has to set it by hand.
---
 kernel/time/hrtimer.c |   19 +++++++++----------
 1 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 005fd44..5fceb3d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -871,7 +871,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
  */
 static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
-			     unsigned long newstate, int reprogram)
+			     bool clear_enqueued, int reprogram)
 {
 	struct timerqueue_node *next_timer;
 	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
@@ -895,7 +895,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
 	if (!timerqueue_getnext(&base->active))
 		base->cpu_base->active_bases &= ~(1 << base->index);
 out:
-	timer->state = newstate;
+	if (clear_enqueued)
+		timer->state &= ~HRTIMER_STATE_ENQUEUED;
 }
 
 /*
@@ -905,7 +906,6 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
 		int reprogram;
 
 		/*
@@ -920,12 +920,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
+		 * This preserves the CALLBACK flag,  otherwise we could move
+		 * the timer base in switch_hrtimer_base.
 		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
-		__remove_hrtimer(timer, base, state, reprogram);
+		__remove_hrtimer(timer, base, true, reprogram);
 		return 1;
 	}
 	return 0;
@@ -1204,7 +1202,8 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 	WARN_ON(!irqs_disabled());
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	timer->state |= HRTIMER_STATE_CALLBACK;
+	__remove_hrtimer(timer, base, true, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1644,7 +1643,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+		__remove_hrtimer(timer, old_base, false, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 3/3] hrtimer: fix the __hrtimer_start_range_ns() race with hrtimer_active()
  2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
  2015-06-08 15:11           ` [PATCH 2/3] hrtimer: turn newstate arg of __remove_hrtimer() into clear_enqueued Oleg Nesterov
@ 2015-06-08 15:11           ` Oleg Nesterov
  2015-06-08 15:12           ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
  2015-06-08 15:35           ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Peter Zijlstra
  3 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

Change __hrtimer_start_range_ns() to preserve ENQUEUED.
---
 kernel/time/hrtimer.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5fceb3d..1761f96 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -902,8 +902,8 @@ out:
 /*
  * remove hrtimer, called with base lock held
  */
-static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+static inline int remove_hrtimer(struct hrtimer *timer,
+			struct hrtimer_clock_base *base, bool clear_enqueued)
 {
 	if (hrtimer_is_queued(timer)) {
 		int reprogram;
@@ -923,7 +923,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		 * This preserves the CALLBACK flag,  otherwise we could move
 		 * the timer base in switch_hrtimer_base.
 		 */
-		__remove_hrtimer(timer, base, true, reprogram);
+		__remove_hrtimer(timer, base, clear_enqueued, reprogram);
 		return 1;
 	}
 	return 0;
@@ -940,7 +940,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	ret = remove_hrtimer(timer, base);
+	ret = remove_hrtimer(timer, base, false);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1061,7 +1061,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base);
+		ret = remove_hrtimer(timer, base, true);
 
 	unlock_hrtimer_base(timer, &flags);
 
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued()
  2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
  2015-06-08 15:11           ` [PATCH 2/3] hrtimer: turn newstate arg of __remove_hrtimer() into clear_enqueued Oleg Nesterov
  2015-06-08 15:11           ` [PATCH 3/3] hrtimer: fix the __hrtimer_start_range_ns() race with hrtimer_active() Oleg Nesterov
@ 2015-06-08 15:12           ` Oleg Nesterov
  2015-06-08 15:35           ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Peter Zijlstra
  3 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

migrate_hrtimer_list() sets HRTIMER_STATE_MIGRATE to avoid the
race with hrtimer_active(), but this can't avoid the same race
with hrtimer_is_queued().

Kill HRTIMER_STATE_MIGRATE, we can simply use STATE_ENQUEUED.
---
 include/linux/hrtimer.h |    6 +-----
 kernel/time/hrtimer.c   |    7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1..44810bc 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd9..005fd44 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1640,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1655,9 +1655,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Re: [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued()
  2015-06-08 15:10         ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
@ 2015-06-08 15:13           ` Oleg Nesterov
  0 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

Sorry for the double/wrong posting. Please ignore this one; I've resent
it in reply to 0/3.

On 06/08, Oleg Nesterov wrote:
>
> migrate_hrtimer_list() sets HRTIMER_STATE_MIGRATE to avoid the
> race with hrtimer_active(), but this can't avoid the same race
> with hrtimer_is_queued().
> 
> Kill HRTIMER_STATE_MIGRATE, we can simply use STATE_ENQUEUED.
> ---
>  include/linux/hrtimer.h |    6 +-----
>  kernel/time/hrtimer.c   |    7 ++-----
>  2 files changed, 3 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
> index 05f6df1..44810bc 100644
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -70,17 +70,13 @@ enum hrtimer_restart {
>   * the handling of the timer.
>   *
>   * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
> - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
> - * also affects HRTIMER_STATE_MIGRATE where the preservation is not
> - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
> - * enqueued on the new cpu.
> + * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
>   *
>   * All state transitions are protected by cpu_base->lock.
>   */
>  #define HRTIMER_STATE_INACTIVE	0x00
>  #define HRTIMER_STATE_ENQUEUED	0x01
>  #define HRTIMER_STATE_CALLBACK	0x02
> -#define HRTIMER_STATE_MIGRATE	0x04
>  
>  /**
>   * struct hrtimer - the basic hrtimer structure
> diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
> index 76d4bd9..005fd44 100644
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -1640,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
>  		debug_deactivate(timer);
>  
>  		/*
> -		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
> +		 * Mark it as ENQUEUED not INACTIVE otherwise the
>  		 * timer could be seen as !active and just vanish away
>  		 * under us on another CPU
>  		 */
> -		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
> +		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
>  		timer->base = new_base;
>  		/*
>  		 * Enqueue the timers on the new cpu. This does not
> @@ -1655,9 +1655,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
>  		 * event device.
>  		 */
>  		enqueue_hrtimer(timer, new_base);
> -
> -		/* Clear the migration state bit */
> -		timer->state &= ~HRTIMER_STATE_MIGRATE;
>  	}
>  }
>  
> -- 
> 1.5.5.1
> 


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08 15:10           ` Peter Zijlstra
@ 2015-06-08 15:16             ` Oleg Nesterov
  0 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> > 	- we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
> > 	  to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
> > 	  and hrtimer_active() we currently have.
>
> So I have your patch and the one I just send out; together they should
> do this.
>
> Can I add your SoB to your patch?

Yes sure, thanks, but could you look at 1-3 I just sent?

1/3 is the same migrate_hrtimer_list() fix and it looks trivial. I need
to re-check 2-3.

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes
  2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
                             ` (2 preceding siblings ...)
  2015-06-08 15:12           ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
@ 2015-06-08 15:35           ` Peter Zijlstra
  2015-06-08 15:56             ` Oleg Nesterov
  2015-06-08 17:11             ` Thomas Gleixner
  3 siblings, 2 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 15:35 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> On 06/08, Peter Zijlstra wrote:
> >
> > > I tend to agree, but I think its a pre-existing problem, not one
> > > introduced by my proposed patch.
> >
> > Something like this would fix that I think. It fully preserves
> > timer->state over hrtimer_start_range_ns().
> 
> Yes, but I think we can do a bit better.
> 
> Only for initial review, I need to re-check this...

I'm having a wee bit of bother spotting how your version is 'better'.

> And. I think that after you remove STATE_CALLBACK we can even kill
> timer->state altogether, but this is another story.

Ah, yes, we could introduce timerqueue_is_queued() which uses
RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.
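
Something like the below, say; the name and placement are purely
illustrative, no such helper exists today:

	static inline bool timerqueue_is_queued(struct timerqueue_node *node)
	{
		/* illustrative only; relies on RB_CLEAR_NODE() at dequeue */
		return !RB_EMPTY_NODE(&node->node);
	}

and hrtimer_is_queued() would become timerqueue_is_queued(&timer->node).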


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08 14:42           ` Peter Zijlstra
@ 2015-06-08 15:49             ` Oleg Nesterov
  0 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> >
> > Most probably I missed something... I'll try to think more, but perhaps
> > you see a hole immediately?
>
> This is something I proposed earlier; Kirill said:
>
>   lkml.kernel.org/r/2134411433408823@web8j.yandex.ru
>
> Which I read like the below, imagine our timer expires periodically and
> rearms itself:
>
>  acquire
>  cpu_base->running = timer;
>  wmb
>  timer->state = INACTIVE;
>  release
> 				[R] timer->state (== INACTIVE)
>  fn()
>  acquire
>  timer->state = ACTIVE
>  wmb
>  cpu_base->running = NULL
>  release
>
> 				[R] cpu_base->running (== NULL)
>
>  acquire
>  cpu_base->running = timer;
>  wmb
>  timer->state = INACTIVE;
>  release
>
> 				[R] timer->state (== INACTIVE)

Damn yes. Thanks Kirill and Peter.

And I swear, I swear I was thinking about this race yesterday but
forgot this problem today ;)

Yes, it seems that we can't avoid the seq counter. But perhaps we
can increment/check it once in __run_hrtimer()/hrtimer_active() ...
I'll try to think.

Thanks!

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes
  2015-06-08 15:35           ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Peter Zijlstra
@ 2015-06-08 15:56             ` Oleg Nesterov
  2015-06-08 17:11             ` Thomas Gleixner
  1 sibling, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 15:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> > On 06/08, Peter Zijlstra wrote:
> > >
> > > > I tend to agree, but I think its a pre-existing problem, not one
> > > > introduced by my proposed patch.
> > >
> > > Something like this would fix that I think. It fully preserves
> > > timer->state over hrtimer_start_range_ns().
> >
> > Yes, but I think we can do a bit better.
> >
> > Only for initial review, I need to re-check this...
>
> I'm having a wee bit of bother spotting how you version is 'better'.
>
> > And. I think that after you remove STATE_CALLBACK we can even kill
> > timer->state altogether, but this is another story.
>
> Ah, yes, we could introduce timerqueue_is_queued() which uses
> RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.

Yes exactly.

And to me 2/3 looks like a cleanup in any case; __remove_hrtimer()
should do nothing with the other bits. Yes,

	timer->state |= HRTIMER_STATE_CALLBACK;
	__remove_hrtimer(timer, base, true, 0);

in __run_hrtimer() looks worse than __remove_hrtimer(CALLBACK), but
you are going to kill STATE_CALLBACK. And this should even simplify
your patch a little bit.

But I agree, this is minor and subjective.

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes
  2015-06-08 15:35           ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Peter Zijlstra
  2015-06-08 15:56             ` Oleg Nesterov
@ 2015-06-08 17:11             ` Thomas Gleixner
  2015-06-08 19:08               ` Peter Zijlstra
  2015-06-08 20:52               ` Oleg Nesterov
  1 sibling, 2 replies; 55+ messages in thread
From: Thomas Gleixner @ 2015-06-08 17:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

On Mon, 8 Jun 2015, Peter Zijlstra wrote:
> On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> > On 06/08, Peter Zijlstra wrote:
> > >
> > > > I tend to agree, but I think its a pre-existing problem, not one
> > > > introduced by my proposed patch.
> > >
> > > Something like this would fix that I think. It fully preserves
> > > timer->state over hrtimer_start_range_ns().
> > 
> > Yes, but I think we can do a bit better.
> > 
> > Only for initial review, I need to re-check this...
> 
> I'm having a wee bit of bother spotting how you version is 'better'.
> 
> > And. I think that after you remove STATE_CALLBACK we can even kill
> > timer->state altogether, but this is another story.
> 
> Ah, yes, we could introduce timerqueue_is_queued() which uses
> RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.

Which won't work for the migration case unless we have some trickery
like we do with doubly linked lists (not setting the prev member to
NULL on dequeue).

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes
  2015-06-08 17:11             ` Thomas Gleixner
@ 2015-06-08 19:08               ` Peter Zijlstra
  2015-06-08 20:52               ` Oleg Nesterov
  1 sibling, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-08 19:08 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Oleg Nesterov, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

On Mon, 2015-06-08 at 19:11 +0200, Thomas Gleixner wrote:

> > Ah, yes, we could introduce timerqueue_is_queued() which uses
> > RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.
> 
> Which won't work for the migration case unless we have some trickery
> like we do with double linked lists (not setting the prev member to
> NULL on dequeue).

Yeah, that dawned on me while away from the computer.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes
  2015-06-08 17:11             ` Thomas Gleixner
  2015-06-08 19:08               ` Peter Zijlstra
@ 2015-06-08 20:52               ` Oleg Nesterov
  1 sibling, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-08 20:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, umgwanakikbuti, mingo, ktkhai, rostedt,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Thomas Gleixner wrote:
>
> On Mon, 8 Jun 2015, Peter Zijlstra wrote:
> > On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> > > On 06/08, Peter Zijlstra wrote:
> > > >
> > > > > I tend to agree, but I think its a pre-existing problem, not one
> > > > > introduced by my proposed patch.
> > > >
> > > > Something like this would fix that I think. It fully preserves
> > > > timer->state over hrtimer_start_range_ns().
> > >
> > > Yes, but I think we can do a bit better.
> > >
> > > Only for initial review, I need to re-check this...
> >
> > I'm having a wee bit of bother spotting how you version is 'better'.
> >
> > > And. I think that after you remove STATE_CALLBACK we can even kill
> > > timer->state altogether, but this is another story.
> >
> > Ah, yes, we could introduce timerqueue_is_queued() which uses
> > RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.
>
> Which won't work for the migration case unless we have some trickery
> like we do with double linked lists (not setting the prev member to
> NULL on dequeue).

Of course, but this is trivial, no? Never mind, I could easily miss
something, and right now this is off-topic.

What do you think about this series? To me it makes sense in any case.
But I need (at least) to update the changelogs. In particular 3/3 doesn't
explain why we need this change. If you missed the previous discussion,
this (hopefully) fixes the problem with the auto-rearming timers: a
"random" hrtimer_start() wrongly creates a window in which such a timer
looks !hrtimer_active().
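
Roughly, the window is (sketch; assuming a timer that otherwise always
re-arms itself and so should never look inactive):

	hrtimer_start(timer)			// from some other thread
	  __hrtimer_start_range_ns(timer, ...)
	    remove_hrtimer(timer)		// clears ENQUEUED
						// <-- hrtimer_active() == false here
	    enqueue_hrtimer(timer)		// sets ENQUEUED again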




Peter, I tried to think again about ->running and all I can say is that
I am totally confused ;) I'll try to write another email tomorrow.

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-08 12:42       ` Peter Zijlstra
  2015-06-08 14:27         ` Oleg Nesterov
@ 2015-06-09 21:33         ` Oleg Nesterov
  2015-06-09 21:39           ` Oleg Nesterov
                             ` (4 more replies)
  1 sibling, 5 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-09 21:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > but then hrtimer_active() should do something like
> > >
> > > 	active = cpu_base->running == timer;
> > > 	if (!active) {
> > > 		rmb();
> > > 		active = state != HRTIMER_STATE_INACTIVE;
> > > 	}
> > >
> > > No?
> >
> > Hmm, good point. Let me think about that. It would be nice to be able to
> > avoid more memory barriers.
>
> So your scenario is:
>
> 				[R] seq
> 				  RMB
> [S] ->state = ACTIVE
>   WMB
> [S] ->running = NULL
> 				[R] ->running (== NULL)
> 				[R] ->state (== INACTIVE; fail to observe
> 				             the ->state store due to
> 					     lack of order)
> 				  RMB
> 				[R] seq (== seq)
> [S] seq++
>
> Conversely, if we re-order the (first) seq++ store such that it comes
> first:
>
> [S] seq++
>
> 				[R] seq
> 				  RMB
> 				[R] ->running (== NULL)
> [S] ->running = timer;
>   WMB
> [S] ->state = INACTIVE
> 				[R] ->state (== INACTIVE)
> 				  RMB
> 				[R] seq (== seq)
>
> And we have another false negative.
>
> And in this case we need the read order the other way around, we'd need:
>
> 	active = timer->state != HRTIMER_STATE_INACTIVE;
> 	if (!active) {
> 		smp_rmb();
> 		active = cpu_base->running == timer;
> 	}
>
> Now I think we can fix this by either doing:
>
> 	WMB
> 	seq++
> 	WMB
>
> On both sides of __run_hrtimer(), or do
>
> bool hrtimer_active(const struct hrtimer *timer)
> {
> 	struct hrtimer_cpu_base *cpu_base;
> 	unsigned int seq;
>
> 	do {
> 		cpu_base = READ_ONCE(timer->base->cpu_base);
> 		seq = raw_read_seqcount(&cpu_base->seq);
>
> 		if (timer->state != HRTIMER_STATE_INACTIVE)
> 			return true;
>
> 		smp_rmb();
>
> 		if (cpu_base->running == timer)
> 			return true;
>
> 		smp_rmb();
>
> 		if (timer->state != HRTIMER_STATE_INACTIVE)
> 			return true;
>
> 	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> 		 cpu_base != READ_ONCE(timer->base->cpu_base));
>
> 	return false;
> }

You know, I simply can't convince myself I understand why this code is
correct... or not.

But contrary to what I said before, I agree that we need to recheck
timer->base. This probably needs more discussion; to me it is very
unobvious why we can trust this cpu_base != READ_ONCE() check. Yes,
we have a lot of barriers, but they do not pair with each other. Let's
ignore this for now.

> And since __run_hrtimer() is the more performance critical code, I think
> it would be best to reduce the amount of memory barriers there.

Yes, but wmb() is cheap on x86... Perhaps we can make this code
"obviously correct" ?


How about the following..... We add cpu_base->seq as before but
limit its "write" scope so that we can use the regular read/retry.

So,

	hrtimer_active(timer)
	{

		do {
			base = READ_ONCE(timer->base->cpu_base);
			seq = read_seqcount_begin(&base->seq);

			if (timer->state & ENQUEUED ||
			    base->running == timer)
				return true;

		} while (read_seqcount_retry(&base->seq, seq) ||
			 base != READ_ONCE(timer->base->cpu_base));

		return false;
	}

And we need to avoid the races with 2 transitions in __run_hrtimer().

The first race is trivial, we change __run_hrtimer() to do

	write_seqcount_begin(cpu_base->seq);
	cpu_base->running = timer;
	__remove_hrtimer(timer);	// clears ENQUEUED
	write_seqcount_end(cpu_base->seq);

and hrtimer_active() obviously can't race with this section.

Then we change enqueue_hrtimer()


	+	bool need_lock = base->cpu_base->running == timer;
	+	if (need_lock)
	+		write_seqcount_begin(cpu_base->seq);
	+
		timer->state |= HRTIMER_STATE_ENQUEUED;
	+
	+	if (need_lock)
	+		write_seqcount_end(cpu_base->seq);


Now. If the timer is re-queued by the time __run_hrtimer() clears
->running we have the following sequence:

	write_seqcount_begin(cpu_base->seq);
	timer->state |= HRTIMER_STATE_ENQUEUED;
	write_seqcount_end(cpu_base->seq);

	base->running = NULL;

and I think this should equally work, because in this case we do not
care if hrtimer_active() misses "running = NULL".

Yes, we only have this 2nd write_seqcount_begin/end if the timer re-
arms itself, but otherwise we do not race. If another thread does
hrtimer_start() in between we can pretend that hrtimer_active() hits
the "inactive".

What do you think?


And. Note that we can rewrite these 2 "write" critical sections in
__run_hrtimer() and enqueue_hrtimer() as

	cpu_base->running = timer;

	write_seqcount_begin(cpu_base->seq);
	write_seqcount_end(cpu_base->seq);

	__remove_hrtimer(timer);

and

	timer->state |= HRTIMER_STATE_ENQUEUED;

	write_seqcount_begin(cpu_base->seq);
	write_seqcount_end(cpu_base->seq);

	base->running = NULL;

So we can probably use write_seqcount_barrier() except I am not sure
about the 2nd wmb...
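
(IIRC write_seqcount_barrier() is currently just

	smp_wmb();
	s->sequence += 2;

so there is no barrier after the increment; that is the 2nd wmb I am not
sure about.)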

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-09 21:33         ` Oleg Nesterov
@ 2015-06-09 21:39           ` Oleg Nesterov
  2015-06-10  6:55           ` Peter Zijlstra
                             ` (3 subsequent siblings)
  4 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-09 21:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

Damn,

On 06/09, Oleg Nesterov wrote:
>
> Yes, we only have this 2nd write_seqcount_begin/end if the timer re-
> arms itself, but otherwise we do not race. If another thread does
> hrtime_start() in between we can pretend that hrtimer_active() hits
> the "inactive".
  ^^^^^^^^^^^^^^^

I meant,

	hits the "inactive" window we can't avoid. Even if another
	thread does this _before_ __run_hrtimer() clears ->running,
	we can't distinguish from the case when it does this right
	_after_ the timer becomes "inactive".

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-09 21:33         ` Oleg Nesterov
  2015-06-09 21:39           ` Oleg Nesterov
@ 2015-06-10  6:55           ` Peter Zijlstra
  2015-06-10  7:46           ` Kirill Tkhai
                             ` (2 subsequent siblings)
  4 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-10  6:55 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Tue, 2015-06-09 at 23:33 +0200, Oleg Nesterov wrote:
> 
> Yes, but wmb() is cheap on x86... Perhaps we can make this code
> "obviously correct" ?

I'll reply to the rest a bit later; got to run some errands first.

The 'problem' of course is ARM/PPC etc.; we would like to keep this
generic code fast, even for the loosely ordered machines.

And wmb() is certainly not cheap for them, although not as bad as a full
barrier.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-09 21:33         ` Oleg Nesterov
  2015-06-09 21:39           ` Oleg Nesterov
  2015-06-10  6:55           ` Peter Zijlstra
@ 2015-06-10  7:46           ` Kirill Tkhai
  2015-06-10 16:04             ` Oleg Nesterov
  2015-06-10 15:49           ` Oleg Nesterov
  2015-06-10 22:37           ` Peter Zijlstra
  4 siblings, 1 reply; 55+ messages in thread
From: Kirill Tkhai @ 2015-06-10  7:46 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

Hi, Oleg,

On Tue, 2015-06-09 at 23:33 +0200, Oleg Nesterov wrote:
> On 06/08, Peter Zijlstra wrote:
> >
> > On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > > but then hrtimer_active() should do something like
> > > >
> > > > 	active = cpu_base->running == timer;
> > > > 	if (!active) {
> > > > 		rmb();
> > > > 		active = state != HRTIMER_STATE_INACTIVE;
> > > > 	}
> > > >
> > > > No?
> > >
> > > Hmm, good point. Let me think about that. It would be nice to be able to
> > > avoid more memory barriers.
> >
> > So your scenario is:
> >
> > 				[R] seq
> > 				  RMB
> > [S] ->state = ACTIVE
> >   WMB
> > [S] ->running = NULL
> > 				[R] ->running (== NULL)
> > 				[R] ->state (== INACTIVE; fail to observe
> > 				             the ->state store due to
> > 					     lack of order)
> > 				  RMB
> > 				[R] seq (== seq)
> > [S] seq++
> >
> > Conversely, if we re-order the (first) seq++ store such that it comes
> > first:
> >
> > [S] seq++
> >
> > 				[R] seq
> > 				  RMB
> > 				[R] ->running (== NULL)
> > [S] ->running = timer;
> >   WMB
> > [S] ->state = INACTIVE
> > 				[R] ->state (== INACTIVE)
> > 				  RMB
> > 				[R] seq (== seq)
> >
> > And we have another false negative.
> >
> > And in this case we need the read order the other way around, we'd need:
> >
> > 	active = timer->state != HRTIMER_STATE_INACTIVE;
> > 	if (!active) {
> > 		smp_rmb();
> > 		active = cpu_base->running == timer;
> > 	}
> >
> > Now I think we can fix this by either doing:
> >
> > 	WMB
> > 	seq++
> > 	WMB
> >
> > On both sides of __run_hrtimer(), or do
> >
> > bool hrtimer_active(const struct hrtimer *timer)
> > {
> > 	struct hrtimer_cpu_base *cpu_base;
> > 	unsigned int seq;
> >
> > 	do {
> > 		cpu_base = READ_ONCE(timer->base->cpu_base);
> > 		seq = raw_read_seqcount(&cpu_base->seq);
> >
> > 		if (timer->state != HRTIMER_STATE_INACTIVE)
> > 			return true;
> >
> > 		smp_rmb();
> >
> > 		if (cpu_base->running == timer)
> > 			return true;
> >
> > 		smp_rmb();
> >
> > 		if (timer->state != HRTIMER_STATE_INACTIVE)
> > 			return true;
> >
> > 	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > 		 cpu_base != READ_ONCE(timer->base->cpu_base));
> >
> > 	return false;
> > }
> 
> You know, I simply can't convince myself I understand why this code
> is correct... or not.
> 
> But contrary to what I said before, I agree that we need to recheck
> timer->base. This probably needs more discussion, to me it is very
> unobvious why we can trust this cpu_base != READ_ONCE() check. Yes,
> we have a lot of barriers, but they do not pair with each other. Lets
> ignore this for now.
> 
> > And since __run_hrtimer() is the more performance critical code, I think
> > it would be best to reduce the amount of memory barriers there.
> 
> Yes, but wmb() is cheap on x86... Perhaps we can make this code
> "obviously correct" ?
> 
> 
> How about the following..... We add cpu_base->seq as before but
> limit its "write" scope so that we can use the regular read/retry.
> 
> So,
> 
> 	hrtimer_active(timer)
> 	{
> 
> 		do {
> 			base = READ_ONCE(timer->base->cpu_base);
> 			seq = read_seqcount_begin(&cpu_base->seq);
> 
> 			if (timer->state & ENQUEUED ||
> 			    base->running == timer)
> 				return true;
> 
> 		} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> 			 base != READ_ONCE(timer->base->cpu_base));
> 
> 		return false;
> 	}
> 
> And we need to avoid the races with 2 transitions in __run_hrtimer().
> 
> The first race is trivial, we change __run_hrtimer() to do
> 
> 	write_seqcount_begin(cpu_base->seq);
> 	cpu_base->running = timer;
> 	__remove_hrtimer(timer);	// clears ENQUEUED
> 	write_seqcount_end(cpu_base->seq);

We use a seqcount because we are afraid that hrtimer_active() may miss
timer->state or cpu_base->running when we are clearing them.

If we use two pairs of write_seqcount_{begin,end} in __run_hrtimer(),
we may protect only the places where we do that:

	cpu_base->running = timer;
	write_seqcount_begin(cpu_base->seq);
	__remove_hrtimer(timer);	// clears ENQUEUED
	write_seqcount_end(cpu_base->seq);

	....

	timer->state |= HRTIMER_STATE_ENQUEUED;
	write_seqcount_begin(cpu_base->seq);
	base->running = NULL;
	write_seqcount_end(cpu_base->seq);

> 
> and hrtimer_active() obviously can't race with this section.
> 
> Then we change enqueue_hrtimer()
> 
> 
> 	+	bool need_lock = base->cpu_base->running == timer;
> 	+	if (need_lock)
> 	+		write_seqcount_begin(cpu_base->seq);
> 	+
> 		timer->state |= HRTIMER_STATE_ENQUEUED;
> 	+
> 	+	if (need_lock)
> 	+		write_seqcount_end(cpu_base->seq);
> 
> 
> Now. If the timer is re-queued by the time __run_hrtimer() clears
> ->running we have the following sequence:
> 
> 	write_seqcount_begin(cpu_base->seq);
> 	timer->state |= HRTIMER_STATE_ENQUEUED;
> 	write_seqcount_end(cpu_base->seq);
> 
> 	base->running = NULL;
> 
> and I think this should equally work, because in this case we do not
> care if hrtimer_active() misses "running = NULL".
> 
> Yes, we only have this 2nd write_seqcount_begin/end if the timer re-
> arms itself, but otherwise we do not race. If another thread does
> hrtimer_start() in between we can pretend that hrtimer_active() hits
> the "inactive".
> 
> What do you think?
> 
> 
> And. Note that we can rewrite these 2 "write" critical sections in
> __run_hrtimer() and enqueue_hrtimer() as
> 
> 	cpu_base->running = timer;
> 
> 	write_seqcount_begin(cpu_base->seq);
> 	write_seqcount_end(cpu_base->seq);
> 
> 	__remove_hrtimer(timer);
> 
> and
> 
> 	timer->state |= HRTIMER_STATE_ENQUEUED;
> 
> 	write_seqcount_begin(cpu_base->seq);
> 	write_seqcount_end(cpu_base->seq);
> 
> 	base->running = NULL;
> 
> So we can probably use write_seqcount_barrier() except I am not sure
> about the 2nd wmb...


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-09 21:33         ` Oleg Nesterov
                             ` (2 preceding siblings ...)
  2015-06-10  7:46           ` Kirill Tkhai
@ 2015-06-10 15:49           ` Oleg Nesterov
  2015-06-10 22:37           ` Peter Zijlstra
  4 siblings, 0 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-10 15:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

Forgot to mention...

On 06/09, Oleg Nesterov wrote:
>
> And. Note that we can rewrite these 2 "write" critical sections in
> __run_hrtimer() and enqueue_hrtimer() as
>
> 	cpu_base->running = timer;
>
> 	write_seqcount_begin(cpu_base->seq);
> 	write_seqcount_end(cpu_base->seq);
>
> 	__remove_hrtimer(timer);
>
> and
>
> 	timer->state |= HRTIMER_STATE_ENQUEUED;
>
> 	write_seqcount_begin(cpu_base->seq);
> 	write_seqcount_end(cpu_base->seq);
>
> 	base->running = NULL;
>
> So we can probably use write_seqcount_barrier() except I am not sure
> about the 2nd wmb...

Or we can change hrtimer_active() to use raw_read_seqcount() like your
patch does (avoid the odd/even games) and then __run_hrtimer() can use
raw_write_seqcount_latch(); it already has wmb's on both sides.

I am not even sure we need the "if (base->cpu_base->running == timer)"
optimization for the 2nd _latch(); we can do this unconditionally in
__run_hrtimer().
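
Concretely, the __run_hrtimer() side would then be something like this
(only a sketch, written in the same pseudo-code style as above, and it
assumes raw_write_seqcount_latch() stays wmb + seq++ + wmb):

	cpu_base->running = timer;
	raw_write_seqcount_latch(&cpu_base->seq);
	__remove_hrtimer(timer);	// clears ENQUEUED

	...	// timer->function() runs, may re-enqueue and set ENQUEUED

	raw_write_seqcount_latch(&cpu_base->seq);
	cpu_base->running = NULL;

with hrtimer_active() doing raw_read_seqcount() + read_seqcount_retry(),
so it never has to wait out an odd count.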

And in this case the code will look very much like your patch, but
imo at the same time it will look much more understandable. Because, again,
this way we just add 2 critical "write" sections (barriers) into
__run_hrtimer(); no need to explain the usage of memory barriers, etc.,
we can simply rely on the acquire/release semantics of seqcount_t.

But yes. This adds 2 additional wmb's into run_hrtimer(). Again, we
can make the 2nd _latch() conditional, since we only need it if the timer
requeues itself, but I am not sure.



Finally. You know, I am afraid very much I confused myself completely
and now I am trying to confuse you and Kirill ;)

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-10  7:46           ` Kirill Tkhai
@ 2015-06-10 16:04             ` Oleg Nesterov
  2015-06-11  7:31               ` Peter Zijlstra
  2015-06-11 16:25               ` Kirill Tkhai
  0 siblings, 2 replies; 55+ messages in thread
From: Oleg Nesterov @ 2015-06-10 16:04 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Peter Zijlstra, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

Hi Kirill,

On 06/10, Kirill Tkhai wrote:
>
> On Tue, 09/06/2015 at 23:33 +0200, Oleg Nesterov wrote:
> >
> > 	hrtimer_active(timer)
> > 	{
> >
> > 		do {
> > 			base = READ_ONCE(timer->base->cpu_base);
> > 			seq = read_seqcount_begin(&cpu_base->seq);
> >
> > 			if (timer->state & ENQUEUED ||
> > 			    base->running == timer)
> > 				return true;
> >
> > 		} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > 			 base != READ_ONCE(timer->base->cpu_base));
> >
> > 		return false;
> > 	}
> >
> > And we need to avoid the races with 2 transitions in __run_hrtimer().
> >
> > The first race is trivial, we change __run_hrtimer() to do
> >
> > 	write_seqcount_begin(cpu_base->seq);
> > 	cpu_base->running = timer;
> > 	__remove_hrtimer(timer);	// clears ENQUEUED
> > 	write_seqcount_end(cpu_base->seq);
>
> We use a seqcount because we are afraid that hrtimer_active() may miss
> timer->state or cpu_base->running when we are clearing them.

Yes,

> If we use two pairs of write_seqcount_{begin,end} in __run_hrtimer(),
> we may protect only the places where we do that:
>
> 	cpu_base->running = timer;
> 	write_seqcount_begin(cpu_base->seq);
> 	__remove_hrtimer(timer);	// clears ENQUEUED
> 	write_seqcount_end(cpu_base->seq);
>
> 	....
>
> 	timer->state |= HRTIMER_STATE_ENQUEUED;
> 	write_seqcount_begin(cpu_base->seq);
> 	base->running = NULL;
> 	write_seqcount_end(cpu_base->seq);

Afaics, no. Afaics, the following code is correct:

	seqcount_t LOCK;
	bool X = true, Y = false;

	void read(void)
	{
		bool x, y;

		do {
			seq = read_seqcount_begin(&LOCK);

			x = X; y = Y;

		} while (read_seqcount_retry(&LOCK, seq));

		BUG_ON(!x && !y);
	}

	void write(void)
	{
		Y = true;

		write_seqcount_begin(LOCK);
		write_seqcount_end(LOCK);

		X = false;
	}

If we rely on the "locking" semantics of seqcount_t, this doesn't really
differ from

	spinlock_t LOCK;
	bool X = true, Y = false;

	void read(void)
	{
		bool x, y;

		spin_lock(LOCK);
		x = X; y = Y;
		spin_unlock(LOCK);

		BUG_ON(!x && !y);
	}

	void write(void)
	{
		Y = true;

		spin_lock(LOCK);
		spin_unlock(LOCK);

		X = false;
	}

If "read" takes the lock before "write", it must see X == true.

Otherwise "read" should see all memory changes done before or
inside the "write" critical section, so it must see Y == true.

No?

Oleg.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-09 21:33         ` Oleg Nesterov
                             ` (3 preceding siblings ...)
  2015-06-10 15:49           ` Oleg Nesterov
@ 2015-06-10 22:37           ` Peter Zijlstra
  4 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-10 22:37 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, wanpeng.li, linux-kernel

On Tue, Jun 09, 2015 at 11:33:18PM +0200, Oleg Nesterov wrote:

> And. Note that we can rewrite these 2 "write" critical sections in
> __run_hrtimer() and enqueue_hrtimer() as
> 
> 	cpu_base->running = timer;
> 
> 	write_seqcount_begin(cpu_base->seq);
> 	write_seqcount_end(cpu_base->seq);
> 
> 	__remove_hrtimer(timer);
> 
> and
> 
> 	timer->state |= HRTIMER_STATE_ENQUEUED;
> 
> 	write_seqcount_begin(cpu_base->seq);
> 	write_seqcount_end(cpu_base->seq);
> 
> 	base->running = NULL;
> 
> So we can probably use write_seqcount_barrier() except I am not sure
> about the 2nd wmb...

Which second wmb?

In any case, you use that transform from your reply to Kirill, and I
cannot currently see a hole in that. Lets call this transformation A. It
gets us the quoted bit above.

Now the above is:

	seq++;
	smp_wmb();
	smp_wmb();
	seq++;

Now, double barriers are pointless, so I think we can all agree that the
above is identical to the below. Lets call this transformation B.

	seq++;
	smp_wmb();
	seq++;

And then because you use the traditional seqcount read side, which
stalls when seq&1, we can transform the above into this. Transformation
C.

	smp_wmb();
	seq += 2;

Which is write_seqcount_barrier(), as you say above.

And since there are no odd numbers possible in that scheme, it's
identical to my modified read side with the single increment. Transform
D.

The only difference at this point is that I have my seq increment on the
'wrong' side on the first state.

	cpu_base->running = timer;

	seq++;
	smp_wmb();

	timer->state = 0;

	...

	timer->state = 1;

	smp_wmb();
	seq++;

	cpu_base->running = NULL;

Which, per my previous mail provides the following:

[S] seq++
                                [R] seq
                                    RMB
                                [R] ->running (== NULL)
[S] ->running = timer;
    WMB
[S] ->state = INACTIVE
                                [R] ->state (== INACTIVE)
                                    RMB
                                [R] seq (== seq)

Which is where we had to modify the read side to do:

		[R] ->state
		    RMB
		[R] ->running

Now, if we use write_seqcount_barrier() that would become:

	__run_hrtimer()				hrtimer_active()

	[S] ->running = timer;			[R] seq
	    WMB					    RMB
	[S] seq += 2;				[R] ->running
	[S] ->state = 0;			[R] ->state
						    RMB
						[R] seq


Which we can reorder like:

						[R] seq
						    RMB
						[R] ->running (== NULL)
	[S] ->running = timer
	    WMB
	[S] ->state = 0
						[R] ->state (== 0)
						   RMB
						[R] seq (== seq)
	[S] seq += 2


Which still gives us that false negative and would still require the
read side to be modified to do:

		[R] ->state
		    RMB
		[R] ->running

IOW, one of our transforms (A-D) is faulty for it requires a
modification to the read side.

I suspect it's T-C, where we lose the odd count that holds up the read
side.

Because the moment we go from:

	Y = true;
	seq++
	WMB
	seq++
	X = false;

to:

	Y = true;
	WMB
	seq += 2;
	X = false;

It becomes possible to re-order like:

	Y = true;
	WMB
	X = false

	seq += 2;

And we lose our read order; or rather, where previously we ordered the
read side by seq, the seq increments are no longer ordered.

With this I think we can prove my code correct; however, it also suggests
that:

	cpu_base->running = timer;
	seq++;
	smp_wmb();
	seq++;
	timer->state = 0;

	...

	timer->state = 1;
	seq++;
	smp_wmb();
	seq++;
	cpu_base->running = NULL;

vs
	hrtimer_active(timer)
        {

                do {
                        base = READ_ONCE(timer->base->cpu_base);
                        seq = read_seqcount_begin(&cpu_base->seq);

                        if (timer->state & ENQUEUED ||
                            base->running == timer)
                                return true;

                } while (read_seqcount_retry(&cpu_base->seq, seq) ||
                         base != READ_ONCE(timer->base->cpu_base));

                return false;
        }

Is the all-round cheapest solution. Those extra seq increments are
almost free on all archs as the cacheline will be hot and modified on
the local cpu.

Only under the very rare condition of a concurrent hrtimer_active() call
will that seq line be pulled into shared state.


I shall go sleep now, and update my patch tomorrow; lets see if I will
still agree with myself after a sleep :-)

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-10 16:04             ` Oleg Nesterov
@ 2015-06-11  7:31               ` Peter Zijlstra
  2015-06-11 16:25               ` Kirill Tkhai
  1 sibling, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-11  7:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Kirill Tkhai, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

On Wed, Jun 10, 2015 at 06:04:44PM +0200, Oleg Nesterov wrote:
> If we rely on the "locking" semantics of seqcount_t, this doesn't really
> differ from
> 
> 	spinlock_t LOCK;
> 	bool X = true, Y = false;
> 
> 	void read(void)
> 	{
> 		bool x, y;
> 
> 		spin_lock(LOCK);
> 		x = X; y = Y;
> 		spin_unlock(LOCK);
> 
> 		BUG_ON(!x && !y);
> 	}
> 
> 	void write(void)
> 	{
> 		Y = true;
> 
> 		spin_lock(LOCK);
> 		spin_unlock(LOCK);
> 
> 		X = false;
> 	}

So when I first saw that I went, wait what?

Because I looked at it like:

	Y = true;
	ACQUIRE
	RELEASE
	X = false;

And we both know that is not ordered at all.

But it's the actual lock semantics that make it work: we cannot acquire
until the other completes, at which time the acquire matches its
release and things end up ordered again.

And it's exactly that part we lost in our transformation C yesterday.

In any case, breakfast time, will go do the patches shortly.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 13/14] lockdep: Implement lock pinning
  2015-06-05  9:55   ` Ingo Molnar
@ 2015-06-11 11:37     ` Peter Zijlstra
  0 siblings, 0 replies; 55+ messages in thread
From: Peter Zijlstra @ 2015-06-11 11:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Fri, Jun 05, 2015 at 11:55:52AM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > RFC: a possible alternative API would be something like:
> > 
> >   int cookie = lockdep_pin_lock(&foo);
> >   ...
> >   lockdep_unpin_lock(&foo, cookie);
> 
> Yeah, this would be even nicer.
> 
> > Where we pick a random number for the pin_count; this makes it
> > impossible to sneak a lock break in without also passing the right
> > cookie along.
> > 
> > I've not done this because it ends up generating code for !LOCKDEP,
> > esp. if you need to pass the cookie around for some reason.
> 
> The cookie could be a zero-size structure, which can be 'passed around' 
> syntactically but creates no overhead in the code.
> 
> But I'd expect cookie-passing to be a sign of badness in most cases: the lock 
> should generally be unpinned at the same level of abstraction...

I have tried to make this work, but so far I've failed at making the
!LOCKDEP case generate 'similar' code.

Esp, things like:

	rq = task_rq_lock(p, flags);

	...

	task_rq_unlock(rq, p, flags);

need to somehow pass the cookie, and all pure stack-based approaches
I've tried ended up being ugly and generating weird code.

So I'll keep the non-cookie approach for now.
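
For reference, the zero-size-cookie variant would look roughly like this
(only a sketch, not code from this series; the type name is made up):

	#ifdef CONFIG_LOCKDEP
	struct pin_cookie { unsigned int val; };
	#else
	struct pin_cookie { };		/* no storage in the !LOCKDEP build */
	#endif

	struct pin_cookie cookie = lockdep_pin_lock(&foo);
	...
	lockdep_unpin_lock(&foo, cookie);

The awkward bit is that task_rq_lock() and friends would then have to
hand that cookie back to their callers as well, and that is where the
!LOCKDEP code generation got weird.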

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-10 16:04             ` Oleg Nesterov
  2015-06-11  7:31               ` Peter Zijlstra
@ 2015-06-11 16:25               ` Kirill Tkhai
  1 sibling, 0 replies; 55+ messages in thread
From: Kirill Tkhai @ 2015-06-11 16:25 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel

On Wed, 10/06/2015 at 18:04 +0200, Oleg Nesterov wrote:
> Hi Kirill,
> 
> On 06/10, Kirill Tkhai wrote:
> >
> > On Tue, 09/06/2015 at 23:33 +0200, Oleg Nesterov wrote:
> > >
> > > 	hrtimer_active(timer)
> > > 	{
> > >
> > > 		do {
> > > 			base = READ_ONCE(timer->base->cpu_base);
> > > 			seq = read_seqcount_begin(&cpu_base->seq);
> > >
> > > 			if (timer->state & ENQUEUED ||
> > > 			    base->running == timer)
> > > 				return true;
> > >
> > > 		} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > > 			 base != READ_ONCE(timer->base->cpu_base));
> > >
> > > 		return false;
> > > 	}
> > >
> > > And we need to avoid the races with 2 transitions in __run_hrtimer().
> > >
> > > The first race is trivial, we change __run_hrtimer() to do
> > >
> > > 	write_seqcount_begin(cpu_base->seq);
> > > 	cpu_base->running = timer;
> > > 	__remove_hrtimer(timer);	// clears ENQUEUED
> > > 	write_seqcount_end(cpu_base->seq);
> >
> > We use a seqcount because we are afraid that hrtimer_active() may miss
> > timer->state or cpu_base->running when we are clearing them.
> 
> Yes,
> 
> > If we use two pairs of write_seqcount_{begin,end} in __run_hrtimer(),
> > we may protect only the places where we do that:
> >
> > 	cpu_base->running = timer;
> > 	write_seqcount_begin(cpu_base->seq);
> > 	__remove_hrtimer(timer);	// clears ENQUEUED
> > 	write_seqcount_end(cpu_base->seq);
> >
> > 	....
> >
> > 	timer->state |= HRTIMER_STATE_ENQUEUED;
> > 	write_seqcount_begin(cpu_base->seq);
> > 	base->running = NULL;
> > 	write_seqcount_end(cpu_base->seq);
> 
> Afaics, no. Afaics, the following code is correct:
> 
> 	seqcount_t LOCK;
> 	bool X = true, Y = false;
> 
> 	void read(void)
> 	{
> 		bool x, y;
> 
> 		do {
> 			seq = read_seqcount_begin(&LOCK);
> 
> 			x = X; y = Y;
> 
> 		} while (read_seqcount_retry(&LOCK, seq));
> 
> 		BUG_ON(!x && !y);
> 	}
> 
> 	void write(void)
> 	{
> 		Y = true;
> 
> 		write_seqcount_begin(LOCK);
> 		write_seqcount_end(LOCK);
> 
> 		X = false;
> 	}
> 
> If we rely on the "locking" semantics of seqcount_t, this doesn't really
> differ from
> 
> 	spinlock_t LOCK;
> 	bool X = true, Y = false;
> 
> 	void read(void)
> 	{
> 		bool x, y;
> 
> 		spin_lock(LOCK);
> 		x = X; y = Y;
> 		spin_unlock(LOCK);
> 
> 		BUG_ON(!x && !y);
> 	}
> 
> 	void write(void)
> 	{
> 		Y = true;
> 
> 		spin_lock(LOCK);
> 		spin_unlock(LOCK);
> 
> 		X = false;
> 	}
> 
> If "read" takes the lock before "write", it must see X == true.
> 
> Otherwise "read" should see all memory changes done before or
> inside the "write" critical section, so it must see Y == true.
> 
> No?

I agree with you. Thanks for the explanation.


^ permalink raw reply	[flat|nested] 55+ messages in thread

end of thread, other threads:[~2015-06-11 16:25 UTC | newest]

Thread overview: 55+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-05  8:48 [PATCH 00/14] sched: balance callbacks Peter Zijlstra
2015-06-05  8:48 ` [PATCH 01/14] sched: Replace post_schedule with a balance callback list Peter Zijlstra
2015-06-05  8:48 ` [PATCH 02/14] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
2015-06-05  8:48 ` [PATCH 03/14] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
2015-06-05  8:48 ` [PATCH 04/14] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
2015-06-05  8:48 ` [PATCH 05/14] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
2015-06-05  8:48 ` [PATCH 06/14] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
2015-06-05  8:48 ` [PATCH 07/14] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
2015-06-05  8:48 ` [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
2015-06-05  9:48   ` Thomas Gleixner
2015-06-07 19:43   ` Oleg Nesterov
2015-06-07 22:33   ` Oleg Nesterov
2015-06-07 22:56     ` Oleg Nesterov
2015-06-08  8:06       ` Thomas Gleixner
2015-06-08  9:14     ` Peter Zijlstra
2015-06-08 10:55       ` Peter Zijlstra
2015-06-08 12:42       ` Peter Zijlstra
2015-06-08 14:27         ` Oleg Nesterov
2015-06-08 14:42           ` Peter Zijlstra
2015-06-08 15:49             ` Oleg Nesterov
2015-06-08 15:10           ` Peter Zijlstra
2015-06-08 15:16             ` Oleg Nesterov
2015-06-09 21:33         ` Oleg Nesterov
2015-06-09 21:39           ` Oleg Nesterov
2015-06-10  6:55           ` Peter Zijlstra
2015-06-10  7:46           ` Kirill Tkhai
2015-06-10 16:04             ` Oleg Nesterov
2015-06-11  7:31               ` Peter Zijlstra
2015-06-11 16:25               ` Kirill Tkhai
2015-06-10 15:49           ` Oleg Nesterov
2015-06-10 22:37           ` Peter Zijlstra
2015-06-08 14:03       ` Oleg Nesterov
2015-06-08 14:17       ` Peter Zijlstra
2015-06-08 15:10         ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Oleg Nesterov
2015-06-08 15:11           ` [PATCH 2/3] hrtimer: turn newstate arg of __remove_hrtimer() into clear_enqueued Oleg Nesterov
2015-06-08 15:11           ` [PATCH 3/3] hrtimer: fix the __hrtimer_start_range_ns() race with hrtimer_active() Oleg Nesterov
2015-06-08 15:12           ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
2015-06-08 15:35           ` [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes Peter Zijlstra
2015-06-08 15:56             ` Oleg Nesterov
2015-06-08 17:11             ` Thomas Gleixner
2015-06-08 19:08               ` Peter Zijlstra
2015-06-08 20:52               ` Oleg Nesterov
2015-06-08 15:10         ` [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued() Oleg Nesterov
2015-06-08 15:13           ` Oleg Nesterov
2015-06-05  8:48 ` [PATCH 09/14] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
2015-06-05  8:48 ` [PATCH 10/14] sched: Move code around Peter Zijlstra
2015-06-05  8:48 ` [PATCH 11/14] sched: Streamline the task migration locking a little Peter Zijlstra
2015-06-05  8:48 ` [PATCH 12/14] lockdep: Simplify lock_release() Peter Zijlstra
2015-06-05  8:48 ` [PATCH 13/14] lockdep: Implement lock pinning Peter Zijlstra
2015-06-05  9:55   ` Ingo Molnar
2015-06-11 11:37     ` Peter Zijlstra
2015-06-05  8:48 ` [PATCH 14/14] sched,lockdep: Employ " Peter Zijlstra
2015-06-05  9:57   ` Ingo Molnar
2015-06-05 11:03     ` Peter Zijlstra
2015-06-05 11:24       ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).