linux-kernel.vger.kernel.org archive mirror
* [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4
@ 2011-01-04 14:59 Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 01/18] sched: Always provide p->on_cpu Peter Zijlstra
                   ` (19 more replies)
  0 siblings, 20 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

Latest version of the scary patches...




* [RFC][PATCH 01/18] sched: Always provide p->on_cpu
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 02/18] mutex: Use p->on_cpu for the adaptive spin Peter Zijlstra
                   ` (18 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-on_cpu.patch --]
[-- Type: text/plain, Size: 3698 bytes --]

Always provide p->on_cpu so that we can determine if it's on a cpu
without having to lock the rq.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    4 +---
 kernel/sched.c        |   46 +++++++++++++++++++++++++++++-----------------
 2 files changed, 30 insertions(+), 20 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -845,18 +845,39 @@ static inline int task_current(struct rq
 	return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+	return p->on_cpu;
+#else
 	return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->on_cpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->on_cpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
 	rq->lock.owner = current;
@@ -872,15 +893,6 @@ static inline void finish_lock_switch(st
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-	return p->oncpu;
-#else
-	return task_current(rq, p);
-#endif
-}
-
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -889,7 +901,7 @@ static inline void prepare_lock_switch(s
 	 * SMP rebalancing from interrupt is the only thing that cares
 	 * here.
 	 */
-	next->oncpu = 1;
+	next->on_cpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	raw_spin_unlock_irq(&rq->lock);
@@ -902,12 +914,12 @@ static inline void finish_lock_switch(st
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 */
 	smp_wmb();
-	prev->oncpu = 0;
+	prev->on_cpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_enable();
@@ -2645,8 +2657,8 @@ void sched_fork(struct task_struct *p, i
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-	p->oncpu = 0;
+#if defined(CONFIG_SMP)
+	p->on_cpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
@@ -5557,8 +5569,8 @@ void __cpuinit init_idle(struct task_str
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-	idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1198,9 +1198,7 @@ struct task_struct {
 	int lock_depth;		/* BKL lock depth */
 
 #ifdef CONFIG_SMP
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
-	int oncpu;
-#endif
+	int on_cpu;
 #endif
 
 	int prio, static_prio, normal_prio;
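
As a minimal sketch (not part of the patch; the helper name is made up),
this is the kind of lockless reader the unconditional p->on_cpu enables,
and how it pairs with the smp_wmb() in finish_lock_switch():

/*
 * Hypothetical illustration: wait for @p to leave its CPU without
 * taking task_rq(p)->lock.  Once we observe ->on_cpu == 0, the
 * smp_wmb() in finish_lock_switch() guarantees the stores done by
 * the context switch are visible before the clear.
 */
static void wait_task_off_cpu(struct task_struct *p)
{
	while (p->on_cpu)
		cpu_relax();	/* compiler barrier, re-reads ->on_cpu */
	smp_rmb();		/* pairs with smp_wmb() in finish_lock_switch() */
}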




* [RFC][PATCH 02/18] mutex: Use p->on_cpu for the adaptive spin
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 01/18] sched: Always provide p->on_cpu Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 03/18] sched: Change the ttwu success details Peter Zijlstra
                   ` (17 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-on_cpu-use.patch --]
[-- Type: text/plain, Size: 5818 bytes --]

Since we now have p->on_cpu unconditionally available, use it to
re-implement mutex_spin_on_owner.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mutex.h |    2 -
 include/linux/sched.h |    2 -
 kernel/mutex-debug.c  |    2 -
 kernel/mutex-debug.h  |    2 -
 kernel/mutex.c        |    2 -
 kernel/mutex.h        |    2 -
 kernel/sched.c        |   83 +++++++++++++++++++-------------------------------
 7 files changed, 39 insertions(+), 56 deletions(-)

Index: linux-2.6/include/linux/mutex.h
===================================================================
--- linux-2.6.orig/include/linux/mutex.h
+++ linux-2.6/include/linux/mutex.h
@@ -51,7 +51,7 @@ struct mutex {
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
-	struct thread_info	*owner;
+	struct task_struct	*owner;
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	const char 		*name;
Index: linux-2.6/kernel/mutex-debug.c
===================================================================
--- linux-2.6.orig/kernel/mutex-debug.c
+++ linux-2.6/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lo
 		return;
 
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+	DEBUG_LOCKS_WARN_ON(lock->owner != current);
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
 	mutex_clear_owner(lock);
 }
Index: linux-2.6/kernel/mutex-debug.h
===================================================================
--- linux-2.6.orig/kernel/mutex-debug.h
+++ linux-2.6/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mute
 
 static inline void mutex_set_owner(struct mutex *lock)
 {
-	lock->owner = current_thread_info();
+	lock->owner = current;
 }
 
 static inline void mutex_clear_owner(struct mutex *lock)
Index: linux-2.6/kernel/mutex.c
===================================================================
--- linux-2.6.orig/kernel/mutex.c
+++ linux-2.6/kernel/mutex.c
@@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, 
 	 */
 
 	for (;;) {
-		struct thread_info *owner;
+		struct task_struct *owner;
 
 		/*
 		 * If we own the BKL, then don't spin. The owner of
Index: linux-2.6/kernel/mutex.h
===================================================================
--- linux-2.6.orig/kernel/mutex.h
+++ linux-2.6/kernel/mutex.h
@@ -19,7 +19,7 @@
 #ifdef CONFIG_SMP
 static inline void mutex_set_owner(struct mutex *lock)
 {
-	lock->owner = current_thread_info();
+	lock->owner = current;
 }
 
 static inline void mutex_clear_owner(struct mutex *lock)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4034,70 +4034,53 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
-{
-	unsigned int cpu;
-	struct rq *rq;
 
-	if (!sched_feat(OWNER_SPIN))
-		return 0;
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+	bool ret = false;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * Need to access the cpu field knowing that
-	 * DEBUG_PAGEALLOC could have unmapped it if
-	 * the mutex owner just released it and exited.
-	 */
-	if (probe_kernel_address(&owner->cpu, cpu))
-		return 0;
-#else
-	cpu = owner->cpu;
-#endif
+	rcu_read_lock();
+	if (lock->owner != owner)
+		goto fail;
 
 	/*
-	 * Even if the access succeeded (likely case),
-	 * the cpu field may no longer be valid.
+	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
+	 * lock->owner still matches owner, if that fails, owner might
+	 * point to free()d memory, if it still matches, the rcu_read_lock()
+	 * ensures the memory stays valid.
 	 */
-	if (cpu >= nr_cpumask_bits)
-		return 0;
+	barrier();
 
-	/*
-	 * We need to validate that we can do a
-	 * get_cpu() and that we have the percpu area.
-	 */
-	if (!cpu_online(cpu))
-		return 0;
+	ret = owner->on_cpu;
+fail:
+	rcu_read_unlock();
 
-	rq = cpu_rq(cpu);
+	return ret;
+}
 
-	for (;;) {
-		/*
-		 * Owner changed, break to re-assess state.
-		 */
-		if (lock->owner != owner) {
-			/*
-			 * If the lock has switched to a different owner,
-			 * we likely have heavy contention. Return 0 to quit
-			 * optimistic spinning and not contend further:
-			 */
-			if (lock->owner)
-				return 0;
-			break;
-		}
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+	if (!sched_feat(OWNER_SPIN))
+		return 0;
 
-		/*
-		 * Is that owner really running on that cpu?
-		 */
-		if (task_thread_info(rq->curr) != owner || need_resched())
+	while (owner_running(lock, owner)) {
+		if (need_resched())
 			return 0;
 
 		arch_mutex_cpu_relax();
 	}
 
+	/*
+	 * If the owner changed to another task there is likely
+	 * heavy contention, stop spinning.
+	 */
+	if (lock->owner)
+		return 0;
+
 	return 1;
 }
 #endif
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -360,7 +360,7 @@ extern signed long schedule_timeout_inte
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
-extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
+extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
 
 struct nsproxy;
 struct user_namespace;
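
For context, a condensed sketch of how the reworked helper is driven
from the __mutex_lock_common() spin loop; this paraphrases the existing
caller rather than quoting it, so treat the details as approximate:

	for (;;) {
		struct task_struct *owner;

		/* Speculative read; owner_running() re-validates under RCU. */
		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;	/* owner scheduled out, or contention: go sleep */

		/* Lock looks free, try to grab it. */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
			mutex_set_owner(lock);
			preempt_enable();
			return 0;
		}

		arch_mutex_cpu_relax();
	}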




* [RFC][PATCH 03/18] sched: Change the ttwu success details
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 01/18] sched: Always provide p->on_cpu Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 02/18] mutex: Use p->on_cpu for the adaptive spin Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 04/18] sched: Clean up ttwu stats Peter Zijlstra
                   ` (16 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-change-ttwu-return.patch --]
[-- Type: text/plain, Size: 2431 bytes --]

try_to_wake_up() used to report success only when it actually had to
place a task on a rq; change that to report success every time we
change p->state to TASK_RUNNING, because that is the real measure of a
wakeup.

As a result, success is always true for the tracepoints.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2383,10 +2383,10 @@ static inline void ttwu_activate(struct 
 	activate_task(rq, p, en_flags);
 }
 
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-					int wake_flags, bool success)
+static void
+ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
 {
-	trace_sched_wakeup(p, success);
+	trace_sched_wakeup(p, true);
 	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
@@ -2406,7 +2406,7 @@ static inline void ttwu_post_activation(
 	}
 #endif
 	/* if a worker is waking up, notify workqueue */
-	if ((p->flags & PF_WQ_WORKER) && success)
+	if (p->flags & PF_WQ_WORKER)
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 
@@ -2505,9 +2505,9 @@ static int try_to_wake_up(struct task_st
 #endif /* CONFIG_SMP */
 	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
 		      cpu == this_cpu, en_flags);
-	success = 1;
 out_running:
-	ttwu_post_activation(p, rq, wake_flags, success);
+	ttwu_post_activation(p, rq, wake_flags);
+	success = 1;
 out:
 	task_rq_unlock(rq, &flags);
 	put_cpu();
@@ -2526,7 +2526,6 @@ static int try_to_wake_up(struct task_st
 static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
-	bool success = false;
 
 	BUG_ON(rq != this_rq());
 	BUG_ON(p == current);
@@ -2541,9 +2540,8 @@ static void try_to_wake_up_local(struct 
 			schedstat_inc(rq, ttwu_local);
 		}
 		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-		success = true;
 	}
-	ttwu_post_activation(p, rq, 0, success);
+	ttwu_post_activation(p, rq, 0);
 }
 
 /**
@@ -2705,7 +2703,7 @@ void wake_up_new_task(struct task_struct
 
 	rq = task_rq_lock(p, &flags);
 	activate_task(rq, p, 0);
-	trace_sched_wakeup_new(p, 1);
+	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)




* [RFC][PATCH 04/18] sched: Clean up ttwu stats
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (2 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 03/18] sched: Change the ttwu success details Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 05/18] sched: Provide p->on_rq Peter Zijlstra
                   ` (15 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu-stat.patch --]
[-- Type: text/plain, Size: 3241 bytes --]

Collect all ttwu stat code into a single function and ensure it's
always called for an actual wakeup (changing p->state to
TASK_RUNNING).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   67 +++++++++++++++++++++++++++------------------------------
 1 file changed, 32 insertions(+), 35 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2366,21 +2366,36 @@ static void update_avg(u64 *avg, u64 sam
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-				 bool is_sync, bool is_migrate, bool is_local,
-				 unsigned long en_flags)
+static void
+ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
 {
+#ifdef CONFIG_SCHEDSTATS
+	int this_cpu = smp_processor_id();
+
+	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);
-	if (is_sync)
+
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-	if (is_migrate)
+
+	if (cpu != task_cpu(p))
 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-	if (is_local)
+
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
 		schedstat_inc(p, se.statistics.nr_wakeups_local);
-	else
-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+	} else {
+		struct sched_domain *sd;
 
-	activate_task(rq, p, en_flags);
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+	}
+#endif /* CONFIG_SCHEDSTATS */
 }
 
 static void
@@ -2440,12 +2455,12 @@ static int try_to_wake_up(struct task_st
 	if (!(p->state & state))
 		goto out;
 
+	cpu = task_cpu(p);
+
 	if (p->se.on_rq)
 		goto out_running;
 
-	cpu = task_cpu(p);
 	orig_cpu = cpu;
-
 #ifdef CONFIG_SMP
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
@@ -2486,27 +2501,12 @@ static int try_to_wake_up(struct task_st
 	WARN_ON(task_cpu(p) != cpu);
 	WARN_ON(p->state != TASK_WAKING);
 
-#ifdef CONFIG_SCHEDSTATS
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu)
-		schedstat_inc(rq, ttwu_local);
-	else {
-		struct sched_domain *sd;
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-	}
-#endif /* CONFIG_SCHEDSTATS */
-
 out_activate:
 #endif /* CONFIG_SMP */
-	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-		      cpu == this_cpu, en_flags);
+	activate_task(rq, p, en_flags);
 out_running:
 	ttwu_post_activation(p, rq, wake_flags);
+	ttwu_stat(rq, p, cpu, wake_flags);
 	success = 1;
 out:
 	task_rq_unlock(rq, &flags);
@@ -2534,14 +2534,11 @@ static void try_to_wake_up_local(struct 
 	if (!(p->state & TASK_NORMAL))
 		return;
 
-	if (!p->se.on_rq) {
-		if (likely(!task_running(rq, p))) {
-			schedstat_inc(rq, ttwu_count);
-			schedstat_inc(rq, ttwu_local);
-		}
-		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-	}
+	if (!p->se.on_rq)
+		activate_task(rq, p, ENQUEUE_WAKEUP);
+
 	ttwu_post_activation(p, rq, 0);
+	ttwu_stat(rq, p, smp_processor_id(), 0);
 }
 
 /**




* [RFC][PATCH 05/18] sched: Provide p->on_rq
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (3 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 04/18] sched: Clean up ttwu stats Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-05  8:13   ` Yong Zhang
  2011-01-29  0:10   ` Frank Rowand
  2011-01-04 14:59 ` [RFC][PATCH 06/18] sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Peter Zijlstra
                   ` (14 subsequent siblings)
  19 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-onrq.patch --]
[-- Type: text/plain, Size: 8343 bytes --]

Provide a generic p->on_rq because the p->se.on_rq semantics are
unfavourable for lockless wakeups but needed for sched_fair.

In particular, p->on_rq is only cleared when we actually dequeue the
task in schedule() and not on any random dequeue as done by things
like __migrate_task() and __sched_setscheduler().

This also allows us to remove p->se usage from !sched_fair code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h   |    1 +
 kernel/sched.c          |   36 ++++++++++++++++++------------------
 kernel/sched_debug.c    |    2 +-
 kernel/sched_rt.c       |   10 +++++-----
 kernel/sched_stoptask.c |    2 +-
 5 files changed, 26 insertions(+), 25 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1200,6 +1200,7 @@ struct task_struct {
 #ifdef CONFIG_SMP
 	int on_cpu;
 #endif
+	int on_rq;
 
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1759,7 +1759,6 @@ static void enqueue_task(struct rq *rq, 
 	update_rq_clock(rq);
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
-	p->se.on_rq = 1;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1767,7 +1766,6 @@ static void dequeue_task(struct rq *rq, 
 	update_rq_clock(rq);
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
-	p->se.on_rq = 0;
 }
 
 /*
@@ -1780,6 +1778,7 @@ static void activate_task(struct rq *rq,
 
 	enqueue_task(rq, p, flags);
 	inc_nr_running(rq);
+	p->on_rq = 1;
 }
 
 /*
@@ -2070,7 +2069,7 @@ static void check_preempt_curr(struct rq
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -2145,7 +2144,7 @@ static bool migrate_task(struct task_str
 	 * If the task is not on a runqueue (and not running), then
 	 * the next wake-up will properly place the task.
 	 */
-	return p->se.on_rq || task_running(rq, p);
+	return p->on_rq || task_running(rq, p);
 }
 
 /*
@@ -2205,7 +2204,7 @@ unsigned long wait_task_inactive(struct 
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->se.on_rq;
+		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -2457,7 +2456,7 @@ static int try_to_wake_up(struct task_st
 
 	cpu = task_cpu(p);
 
-	if (p->se.on_rq)
+	if (p->on_rq)
 		goto out_running;
 
 	orig_cpu = cpu;
@@ -2534,7 +2533,7 @@ static void try_to_wake_up_local(struct 
 	if (!(p->state & TASK_NORMAL))
 		return;
 
-	if (!p->se.on_rq)
+	if (!p->on_rq)
 		activate_task(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_post_activation(p, rq, 0);
@@ -2571,18 +2570,20 @@ int wake_up_state(struct task_struct *p,
  */
 static void __sched_fork(struct task_struct *p)
 {
+	p->on_rq				= 0;
+
+	p->se.on_rq			= 0;
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
+	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
-	p->se.on_rq = 0;
-	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3904,7 +3905,7 @@ static inline void schedule_debug(struct
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	if (prev->se.on_rq)
+	if (prev->on_rq)
 		update_rq_clock(rq);
 	prev->sched_class->put_prev_task(rq, prev);
 }
@@ -3983,6 +3984,7 @@ asmlinkage void __sched schedule(void)
 					try_to_wake_up_local(to_wakeup);
 			}
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+			prev->on_rq = 0;
 		}
 		switch_count = &prev->nvcsw;
 	}
@@ -4546,7 +4548,7 @@ void rt_mutex_setprio(struct task_struct
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
 		dequeue_task(rq, p, 0);
@@ -4595,7 +4597,7 @@ void set_user_nice(struct task_struct *p
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 
@@ -4729,8 +4731,6 @@ static struct task_struct *find_process_
 static void
 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(p->se.on_rq);
-
 	p->policy = policy;
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
@@ -4878,7 +4878,7 @@ static int __sched_setscheduler(struct t
 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
 		deactivate_task(rq, p, 0);
@@ -5737,7 +5737,7 @@ static int __migrate_task(struct task_st
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
-	if (p->se.on_rq) {
+	if (p->on_rq) {
 		deactivate_task(rq_src, p, 0);
 		set_task_cpu(p, dest_cpu);
 		activate_task(rq_dest, p, 0);
@@ -8106,7 +8106,7 @@ static void normalize_task(struct rq *rq
 {
 	int on_rq;
 
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	if (on_rq)
 		deactivate_task(rq, p, 0);
 	__setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8449,7 +8449,7 @@ void sched_move_task(struct task_struct 
 	rq = task_rq_lock(tsk, &flags);
 
 	running = task_current(rq, tsk);
-	on_rq = tsk->se.on_rq;
+	on_rq = tsk->on_rq;
 
 	if (on_rq)
 		dequeue_task(rq, tsk, 0);
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -127,7 +127,7 @@ static void print_rq(struct seq_file *m,
 	read_lock_irqsave(&tasklist_lock, flags);
 
 	do_each_thread(g, p) {
-		if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+		if (!p->on_rq || task_cpu(p) != rq_cpu)
 			continue;
 
 		print_task(m, rq, p);
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -1132,7 +1132,7 @@ static void put_prev_task_rt(struct rq *
 	 * The previous task needs to be made eligible for pushing
 	 * if it is still active
 	 */
-	if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+	if (p->on_rq && p->rt.nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1283,7 +1283,7 @@ static struct rq *find_lock_lowest_rq(st
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       &task->cpus_allowed) ||
 				     task_running(rq, task) ||
-				     !task->se.on_rq)) {
+				     !task->on_rq)) {
 
 				raw_spin_unlock(&lowest_rq->lock);
 				lowest_rq = NULL;
@@ -1317,7 +1317,7 @@ static struct task_struct *pick_next_pus
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->rt.nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->se.on_rq);
+	BUG_ON(!p->on_rq);
 	BUG_ON(!rt_task(p));
 
 	return p;
@@ -1463,7 +1463,7 @@ static int pull_rt_task(struct rq *this_
 		 */
 		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->se.on_rq);
+			WARN_ON(!p->on_rq);
 
 			/*
 			 * There's a chance that p is higher in priority
@@ -1534,7 +1534,7 @@ static void set_cpus_allowed_rt(struct t
 	 * Update the migration status of the RQ if we have an RT task
 	 * which is running AND changing its weight value.
 	 */
-	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
+	if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
 		struct rq *rq = task_rq(p);
 
 		if (!task_current(rq, p)) {
Index: linux-2.6/kernel/sched_stoptask.c
===================================================================
--- linux-2.6.orig/kernel/sched_stoptask.c
+++ linux-2.6/kernel/sched_stoptask.c
@@ -26,7 +26,7 @@ static struct task_struct *pick_next_tas
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->se.on_rq)
+	if (stop && stop->on_rq)
 		return stop;
 
 	return NULL;
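
To restate the new invariant as a hedged sketch (the function is
hypothetical; the calling pattern mirrors rt_mutex_setprio() and
__sched_setscheduler() above): p->on_rq is set in activate_task() and
cleared only by schedule() when the task really blocks, so it stays set
across a transient requeue:

static void requeue_task_sketch(struct rq *rq, struct task_struct *p)
{
	int on_rq = p->on_rq;		/* 1 for a runnable task */

	if (on_rq)
		dequeue_task(rq, p, 0);	/* class entity leaves its queue */

	/* change priority/policy here; p->on_rq is deliberately untouched */

	if (on_rq)
		enqueue_task(rq, p, 0);	/* entity goes back on the queue */
}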




* [RFC][PATCH 06/18] sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (4 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 05/18] sched: Provide p->on_rq Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq() Peter Zijlstra
                   ` (13 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-pi_lock-wakeup.patch --]
[-- Type: text/plain, Size: 3615 bytes --]

Currently p->pi_lock already serializes p->sched_class; also put
p->cpus_allowed and try_to_wake_up() under it. This prepares the way
to do the first part of ttwu() without holding rq->lock.

With p->sched_class and p->cpus_allowed both serialized by p->pi_lock,
we can later call select_task_rq() without holding rq->lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2301,7 +2301,7 @@ void task_oncpu_function_call(struct tas
 
 #ifdef CONFIG_SMP
 /*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -2334,7 +2334,7 @@ static int select_fallback_rq(int cpu, s
 }
 
 /*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
 int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
@@ -2450,7 +2450,8 @@ static int try_to_wake_up(struct task_st
 	this_cpu = get_cpu();
 
 	smp_wmb();
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	rq = __task_rq_lock(p);
 	if (!(p->state & state))
 		goto out;
 
@@ -2508,7 +2509,8 @@ static int try_to_wake_up(struct task_st
 	ttwu_stat(rq, p, cpu, wake_flags);
 	success = 1;
 out:
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 	put_cpu();
 
 	return success;
@@ -4543,6 +4545,8 @@ void rt_mutex_setprio(struct task_struct
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
+	lockdep_assert_held(&p->pi_lock);
+
 	rq = task_rq_lock(p, &flags);
 
 	trace_sched_pi_setprio(p, prio);
@@ -5150,7 +5154,6 @@ long sched_getaffinity(pid_t pid, struct
 {
 	struct task_struct *p;
 	unsigned long flags;
-	struct rq *rq;
 	int retval;
 
 	get_online_cpus();
@@ -5165,9 +5168,9 @@ long sched_getaffinity(pid_t pid, struct
 	if (retval)
 		goto out_unlock;
 
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
-	task_rq_unlock(rq, &flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
 	rcu_read_unlock();
@@ -5652,18 +5655,8 @@ int set_cpus_allowed_ptr(struct task_str
 	unsigned int dest_cpu;
 	int ret = 0;
 
-	/*
-	 * Serialize against TASK_WAKING so that ttwu() and wunt() can
-	 * drop the rq->lock and still rely on ->cpus_allowed.
-	 */
-again:
-	while (task_is_waking(p))
-		cpu_relax();
-	rq = task_rq_lock(p, &flags);
-	if (task_is_waking(p)) {
-		task_rq_unlock(rq, &flags);
-		goto again;
-	}
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	rq = __task_rq_lock(p);
 
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
@@ -5691,13 +5684,15 @@ int set_cpus_allowed_ptr(struct task_str
 	if (migrate_task(p, rq)) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
 	}
 out:
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return ret;
 }
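
A hedged sketch of the resulting lock ordering (the function name is
made up; the body mirrors the set_cpus_allowed_ptr() hunk above):
p->pi_lock is taken first, rq->lock nests inside it, and ->cpus_allowed
may be relied upon while either is held:

static void cpus_allowed_writer_sketch(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);	/* outer lock */
	rq = __task_rq_lock(p);				/* rq->lock nests inside */

	/* writers of ->cpus_allowed hold both; readers may hold either */

	__task_rq_unlock(rq);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}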




* [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (5 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 06/18] sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-06 13:57   ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 08/18] sched: Remove rq argument to sched_class::task_waking() Peter Zijlstra
                   ` (12 subsequent siblings)
  19 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-select_task_rq.patch --]
[-- Type: text/plain, Size: 6435 bytes --]

In preparation for calling select_task_rq() without rq->lock held,
drop the dependency on the rq argument.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h   |    3 +--
 kernel/sched.c          |   20 +++++++++++---------
 kernel/sched_fair.c     |    2 +-
 kernel/sched_idletask.c |    2 +-
 kernel/sched_rt.c       |   10 +++++++++-
 kernel/sched_stoptask.c |    3 +--
 6 files changed, 24 insertions(+), 16 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1063,8 +1063,7 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
-			       int sd_flag, int flags);
+	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2138,13 +2138,15 @@ static int migration_cpu_stop(void *data
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
+static bool need_migrate_task(struct task_struct *p)
 {
 	/*
 	 * If the task is not on a runqueue (and not running), then
 	 * the next wake-up will properly place the task.
 	 */
-	return p->on_rq || task_running(rq, p);
+	bool running = p->on_rq || p->on_cpu;
+	smp_rmb(); /* finish_lock_switch() */
+	return running;
 }
 
 /*
@@ -2337,9 +2339,9 @@ static int select_fallback_rq(int cpu, s
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -2484,7 +2486,7 @@ static int try_to_wake_up(struct task_st
 		en_flags |= ENQUEUE_WAKING;
 	}
 
-	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
 	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
 	__task_rq_unlock(rq);
@@ -2694,7 +2696,7 @@ void wake_up_new_task(struct task_struct
 	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
 	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
 	set_task_cpu(p, cpu);
 
 	p->state = TASK_RUNNING;
@@ -3420,7 +3422,7 @@ void sched_exec(void)
 	int dest_cpu;
 
 	rq = task_rq_lock(p, &flags);
-	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
@@ -3428,7 +3430,7 @@ void sched_exec(void)
 	 * select_task_rq() can race against ->cpus_allowed
 	 */
 	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+	    likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
 		struct migration_arg arg = { p, dest_cpu };
 
 		task_rq_unlock(rq, &flags);
@@ -5681,7 +5683,7 @@ int set_cpus_allowed_ptr(struct task_str
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (migrate_task(p, rq)) {
+	if (need_migrate_task(p)) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		__task_rq_unlock(rq);
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1623,7 +1623,7 @@ static int select_idle_sibling(struct ta
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
Index: linux-2.6/kernel/sched_idletask.c
===================================================================
--- linux-2.6.orig/kernel/sched_idletask.c
+++ linux-2.6/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -973,11 +973,18 @@ static void yield_task_rt(struct rq *rq)
 static int find_lowest_rq(struct task_struct *task);
 
 static int
-select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 {
 	if (sd_flag != SD_BALANCE_WAKE)
 		return smp_processor_id();
 
+#if 0
+	/*
+	 * XXX without holding rq->lock the below is racy, need to
+	 * rewrite it in a racy but non-dangerous way so that we mostly
+	 * get the benefit of the heuristic but don't crash the kernel
+	 * if we get it wrong ;-)
+	 */
 	/*
 	 * If the current task is an RT task, then
 	 * try to see if we can wake this RT task up on another
@@ -1002,6 +1009,7 @@ select_task_rq_rt(struct rq *rq, struct 
 
 		return (cpu == -1) ? task_cpu(p) : cpu;
 	}
+#endif
 
 	/*
 	 * Otherwise, just let it ride on the affined RQ and the
Index: linux-2.6/kernel/sched_stoptask.c
===================================================================
--- linux-2.6.orig/kernel/sched_stoptask.c
+++ linux-2.6/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_stop(struct rq *rq, struct task_struct *p,
-		    int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* stop tasks as never migrate */
 }




* [RFC][PATCH 08/18] sched: Remove rq argument to sched_class::task_waking()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (6 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 09/18] sched: Delay task_contributes_to_load() Peter Zijlstra
                   ` (11 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-task_waking.patch --]
[-- Type: text/plain, Size: 2230 bytes --]

In preparation for calling this without rq->lock held, remove the
dependency on the rq argument.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |   10 +++++++---
 kernel/sched.c        |    2 +-
 kernel/sched_fair.c   |    4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1045,8 +1045,12 @@ struct sched_domain;
 #define WF_FORK		0x02		/* child wakeup after fork */
 
 #define ENQUEUE_WAKEUP		1
-#define ENQUEUE_WAKING		2
-#define ENQUEUE_HEAD		4
+#define ENQUEUE_HEAD		2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING		0
+#endif
 
 #define DEQUEUE_SLEEP		1
 
@@ -1067,7 +1071,7 @@ struct sched_class {
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
-	void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2481,7 +2481,7 @@ static int try_to_wake_up(struct task_st
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking) {
-		p->sched_class->task_waking(rq, p);
+		p->sched_class->task_waking(p);
 		en_flags |= ENQUEUE_WAKING;
 	}
 
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1338,11 +1338,13 @@ static void yield_task_fair(struct rq *r
 
 #ifdef CONFIG_SMP
 
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
+	lockdep_assert_held(&task_rq(p)->lock);
+
 	se->vruntime -= cfs_rq->min_vruntime;
 }
 




* [RFC][PATCH 09/18] sched: Delay task_contributes_to_load()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (7 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 08/18] sched: Remove rq argument to sched_class::task_waking() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 10/18] sched: Also serialize ttwu_local() with p->pi_lock Peter Zijlstra
                   ` (10 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu-contribute-to-load.patch --]
[-- Type: text/plain, Size: 1771 bytes --]

In preparation for calling task_contributes_to_load() without holding
rq->lock, we need to store its result until we again hold rq->lock and
can update the rq accounting accordingly.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    1 +
 kernel/sched.c        |   16 ++++------------
 2 files changed, 5 insertions(+), 12 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1264,6 +1264,7 @@ struct task_struct {
 
 	/* Revert to default priority/policy when forking */
 	unsigned sched_reset_on_fork:1;
+	unsigned sched_contributes_to_load:1;
 
 	pid_t pid;
 	pid_t tgid;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2467,18 +2467,7 @@ static int try_to_wake_up(struct task_st
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	/*
-	 * In order to handle concurrent wakeups and release the rq->lock
-	 * we put the task in TASK_WAKING state.
-	 *
-	 * First fix up the nr_uninterruptible count:
-	 */
-	if (task_contributes_to_load(p)) {
-		if (likely(cpu_online(orig_cpu)))
-			rq->nr_uninterruptible--;
-		else
-			this_rq()->nr_uninterruptible--;
-	}
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking) {
@@ -2503,6 +2492,9 @@ static int try_to_wake_up(struct task_st
 	WARN_ON(task_cpu(p) != cpu);
 	WARN_ON(p->state != TASK_WAKING);
 
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible--;
+
 out_activate:
 #endif /* CONFIG_SMP */
 	activate_task(rq, p, en_flags);




* [RFC][PATCH 10/18] sched: Also serialize ttwu_local() with p->pi_lock
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (8 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 09/18] sched: Delay task_contributes_to_load() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock() Peter Zijlstra
                   ` (9 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu_local.patch --]
[-- Type: text/plain, Size: 2481 bytes --]

Since we now serialize ttwu() using p->pi_lock, we also need to
serialize ttwu_local() with it; otherwise, once we drop rq->lock in
ttwu(), it can race with ttwu_local().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2513,9 +2513,9 @@ static int try_to_wake_up(struct task_st
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
- * Put @p on the run-queue if it's not alredy there.  The caller must
+ * Put @p on the run-queue if it's not alredy there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.  this_rq() stays locked over invocation.
+ * the current task.
  */
 static void try_to_wake_up_local(struct task_struct *p)
 {
@@ -2523,16 +2523,21 @@ static void try_to_wake_up_local(struct 
 
 	BUG_ON(rq != this_rq());
 	BUG_ON(p == current);
-	lockdep_assert_held(&rq->lock);
+
+	raw_spin_unlock(&rq->lock);
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
 
 	if (!(p->state & TASK_NORMAL))
-		return;
+		goto out;
 
 	if (!p->on_rq)
 		activate_task(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_post_activation(p, rq, 0);
 	ttwu_stat(rq, p, smp_processor_id(), 0);
+out:
+	raw_spin_unlock(&p->pi_lock);
 }
 
 /**
@@ -3925,6 +3930,7 @@ pick_next_task(struct rq *rq)
  */
 asmlinkage void __sched schedule(void)
 {
+	struct task_struct *to_wakeup = NULL;
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
@@ -3958,21 +3964,21 @@ asmlinkage void __sched schedule(void)
 			 * task to maintain concurrency.  If so, wake
 			 * up the task.
 			 */
-			if (prev->flags & PF_WQ_WORKER) {
-				struct task_struct *to_wakeup;
-
+			if (prev->flags & PF_WQ_WORKER)
 				to_wakeup = wq_worker_sleeping(prev, cpu);
-				if (to_wakeup)
-					try_to_wake_up_local(to_wakeup);
-			}
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
 		}
 		switch_count = &prev->nvcsw;
 	}
 
+	/*
+	 * All three: try_to_wake_up_local(), pre_schedule() and idle_balance()
+	 * can drop rq->lock.
+	 */
+	if (to_wakeup)
+		try_to_wake_up_local(to_wakeup);
 	pre_schedule(rq, prev);
-
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 




* [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (9 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 10/18] sched: Also serialize ttwu_local() with p->pi_lock Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-05 18:46   ` Oleg Nesterov
  2011-01-29  0:21   ` Frank Rowand
  2011-01-04 14:59 ` [RFC][PATCH 12/18] sched: Drop rq->lock from first part of wake_up_new_task() Peter Zijlstra
                   ` (8 subsequent siblings)
  19 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu-task_rq_lock.patch --]
[-- Type: text/plain, Size: 8014 bytes --]

In order to be able to call set_task_cpu() while holding either
p->pi_lock or task_rq(p)->lock, we need to hold both locks to
stabilize task_rq().

This makes task_rq_lock() acquire both locks and has __task_rq_lock()
validate that p->pi_lock is held. This increases the locking overhead
for most scheduler syscalls but allows us to reduce rq->lock
contention on some scheduler hot paths (ttwu).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   81 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 44 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -602,7 +602,7 @@ static inline int cpu_of(struct rq *rq)
  * Return the group to which this tasks belongs.
  *
  * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
  * holds that lock for each task it moves into the cgroup. Therefore
  * by holding that lock, we pin the task to the current cgroup.
  */
@@ -612,7 +612,7 @@ static inline struct task_group *task_gr
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&task_rq(p)->lock));
+			lockdep_is_held(&p->pi_lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -928,23 +928,15 @@ static inline void finish_lock_switch(st
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
-	return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
+	lockdep_assert_held(&p->pi_lock);
+
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
@@ -955,22 +947,22 @@ static inline struct rq *__task_rq_lock(
 }
 
 /*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
 	for (;;) {
-		local_irq_save(*flags);
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p)))
 			return rq;
-		raw_spin_unlock_irqrestore(&rq->lock, *flags);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 	}
 }
 
@@ -980,10 +972,13 @@ static void __task_rq_unlock(struct rq *
 	raw_spin_unlock(&rq->lock);
 }
 
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
-	raw_spin_unlock_irqrestore(&rq->lock, *flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
 /*
@@ -2115,6 +2110,11 @@ void set_task_cpu(struct task_struct *p,
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2210,7 +2210,7 @@ unsigned long wait_task_inactive(struct 
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, p, &flags);
 
 		/*
 		 * If it changed from the expected state, bail out now.
@@ -2596,6 +2596,7 @@ static void __sched_fork(struct task_str
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
+	unsigned long flags;
 	int cpu = get_cpu();
 
 	__sched_fork(p);
@@ -2646,9 +2647,9 @@ void sched_fork(struct task_struct *p, i
 	 *
 	 * Silence PROVE_RCU.
 	 */
-	rcu_read_lock();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
-	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -3472,7 +3473,7 @@ unsigned long long task_delta_exec(struc
 
 	rq = task_rq_lock(p, &flags);
 	ns = do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3490,7 +3491,7 @@ unsigned long long task_sched_runtime(st
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3514,7 +3515,7 @@ unsigned long long thread_group_sched_ru
 	rq = task_rq_lock(p, &flags);
 	thread_group_cputime(p, &totals);
 	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -4538,16 +4539,13 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	unsigned long flags;
 	int oldprio, on_rq, running;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
-	lockdep_assert_held(&p->pi_lock);
-
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
@@ -4573,7 +4571,7 @@ void rt_mutex_setprio(struct task_struct
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
 }
 
 #endif
@@ -4621,7 +4619,7 @@ void set_user_nice(struct task_struct *p
 			resched_task(rq->curr);
 	}
 out_unlock:
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -4843,13 +4841,11 @@ static int __sched_setscheduler(struct t
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
-	 */
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	/*
+	 *
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = __task_rq_lock(p);
+	rq = task_rq_lock(p, &flags);
 
 	/*
 	 * Changing the policy of the stop threads its a very bad idea
@@ -4902,8 +4898,7 @@ static int __sched_setscheduler(struct t
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
-	__task_rq_unlock(rq);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	task_rq_unlock(rq, p, &flags);
 
 	rt_mutex_adjust_pi(p);
 
@@ -5432,7 +5427,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p
 
 	rq = task_rq_lock(p, &flags);
 	time_slice = p->sched_class->get_rr_interval(rq, p);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	rcu_read_unlock();
 	jiffies_to_timespec(time_slice, &t);
@@ -5655,8 +5650,7 @@ int set_cpus_allowed_ptr(struct task_str
 	unsigned int dest_cpu;
 	int ret = 0;
 
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	rq = __task_rq_lock(p);
+	rq = task_rq_lock(p, &flags);
 
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
@@ -5691,8 +5685,7 @@ int set_cpus_allowed_ptr(struct task_str
 		return 0;
 	}
 out:
-	__task_rq_unlock(rq);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ret;
 }
@@ -8463,7 +8456,7 @@ void sched_move_task(struct task_struct 
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
 
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 12/18] sched: Drop rq->lock from first part of wake_up_new_task()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (10 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 13/18] sched: Drop rq->lock from sched_exec() Peter Zijlstra
                   ` (7 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-wunt.patch --]
[-- Type: text/plain, Size: 1691 bytes --]

Since p->pi_lock now protects all the state needed to call
select_task_rq(), avoid the double remote rq->lock acquisition and rely
on p->pi_lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2681,28 +2681,18 @@ void wake_up_new_task(struct task_struct
 {
 	unsigned long flags;
 	struct rq *rq;
-	int cpu __maybe_unused = get_cpu();
 
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
-	rq = task_rq_lock(p, &flags);
-	p->state = TASK_WAKING;
-
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 *  - cpus_allowed can change in the fork path
 	 *  - any previously selected cpu might disappear through hotplug
-	 *
-	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
-	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-	set_task_cpu(p, cpu);
-
-	p->state = TASK_RUNNING;
-	task_rq_unlock(rq, &flags);
+	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
 #endif
 
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -2710,8 +2700,8 @@ void wake_up_new_task(struct task_struct
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 #endif
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	__task_rq_unlock(rq);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 13/18] sched: Drop rq->lock from sched_exec()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (11 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 12/18] sched: Drop rq->lock from first part of wake_up_new_task() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
                   ` (6 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-exec.patch --]
[-- Type: text/plain, Size: 1518 bytes --]

Since we can now call select_task_rq() and set_task_cpu() with only
p->pi_lock held, and sched_exec() load-balancing has always been
optimistic, drop all rq->lock usage.

Oleg also noted that need_migrate_task() will always be true for
current, so don't bother calling that at all.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -3412,27 +3412,22 @@ void sched_exec(void)
 {
 	struct task_struct *p = current;
 	unsigned long flags;
-	struct rq *rq;
 	int dest_cpu;
 
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	/*
-	 * select_task_rq() can race against ->cpus_allowed
-	 */
-	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
+	if (likely(cpu_active(dest_cpu))) {
 		struct migration_arg arg = { p, dest_cpu };
 
-		task_rq_unlock(rq, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 		return;
 	}
 unlock:
-	task_rq_unlock(rq, &flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 #endif



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (12 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 13/18] sched: Drop rq->lock from sched_exec() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-06 16:29   ` Peter Zijlstra
  2011-01-29  1:05   ` Frank Rowand
  2011-01-04 14:59 ` [RFC][PATCH 15/18] sched: Remove rq argument from ttwu_stat() Peter Zijlstra
                   ` (5 subsequent siblings)
  19 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu-optimize.patch --]
[-- Type: text/plain, Size: 3795 bytes --]

Currently ttwu() does two rq->lock acquisitions: once on the task's
old rq, holding it over the p->state fiddling and load-balance pass.
Then it drops the old rq->lock to acquire the new rq->lock.

By having serialized ttwu(), p->sched_class, p->cpus_allowed with
p->pi_lock, we can now drop the whole first rq->lock acquisition.

The p->pi_lock serializing concurrent ttwu() calls protects p->state,
which we will set to TASK_WAKING to bridge possible p->pi_lock to
rq->lock gaps and serialize set_task_cpu() calls against
task_rq_lock().

The p->pi_lock serialization of p->sched_class allows us to call
scheduling class methods without holding the rq->lock, and the
serialization of p->cpus_allowed allows us to do the load-balancing
bits without races.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c      |   47 +++++++++++++++++++----------------------------
 kernel/sched_fair.c |    3 +--
 2 files changed, 20 insertions(+), 30 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2436,69 +2436,60 @@ ttwu_post_activation(struct task_struct 
  * Returns %true if @p was woken up, %false if it was already running
  * or @state didn't match @p's state.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
-			  int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-	int cpu, orig_cpu, this_cpu, success = 0;
+	int cpu, this_cpu, success = 0;
 	unsigned long flags;
-	unsigned long en_flags = ENQUEUE_WAKEUP;
 	struct rq *rq;
 
 	this_cpu = get_cpu();
 
 	smp_wmb();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	rq = __task_rq_lock(p);
 	if (!(p->state & state))
 		goto out;
 
 	cpu = task_cpu(p);
 
-	if (p->on_rq)
-		goto out_running;
+	if (p->on_rq) {
+		rq = __task_rq_lock(p);
+		if (p->on_rq)
+			goto out_running;
+		__task_rq_unlock(rq);
+	}
 
-	orig_cpu = cpu;
 #ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
-		goto out_activate;
+	while (p->on_cpu)
+		cpu_relax();
 
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
-	if (p->sched_class->task_waking) {
+	if (p->sched_class->task_waking)
 		p->sched_class->task_waking(p);
-		en_flags |= ENQUEUE_WAKING;
-	}
 
 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
-		set_task_cpu(p, cpu);
-	__task_rq_unlock(rq);
+#endif /* CONFIG_SMP */
 
 	rq = cpu_rq(cpu);
 	raw_spin_lock(&rq->lock);
 
-	/*
-	 * We migrated the task without holding either rq->lock, however
-	 * since the task is not on the task list itself, nobody else
-	 * will try and migrate the task, hence the rq should match the
-	 * cpu we just moved it to.
-	 */
-	WARN_ON(task_cpu(p) != cpu);
-	WARN_ON(p->state != TASK_WAKING);
+#ifdef CONFIG_SMP
+	if (cpu != task_cpu(p))
+		set_task_cpu(p, cpu);
 
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
+#endif
 
-out_activate:
-#endif /* CONFIG_SMP */
-	activate_task(rq, p, en_flags);
+	activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 out_running:
 	ttwu_post_activation(p, rq, wake_flags);
 	ttwu_stat(rq, p, cpu, wake_flags);
 	success = 1;
-out:
 	__task_rq_unlock(rq);
+out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 	put_cpu();
 
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1343,8 +1343,7 @@ static void task_waking_fair(struct task
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-	lockdep_assert_held(&task_rq(p)->lock);
-
+	// XXX racy on 32bit
 	se->vruntime -= cfs_rq->min_vruntime;
 }
 



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 15/18] sched: Remove rq argument from ttwu_stat()
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (13 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-04 14:59 ` [RFC][PATCH 16/18] sched: Rename ttwu_post_activation Peter Zijlstra
                   ` (4 subsequent siblings)
  19 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu_stat-rq.patch --]
[-- Type: text/plain, Size: 1535 bytes --]

In order to call ttwu_stat() without holding rq->lock we must remove
its rq argument. Since we need to change rq stats, account to the
local rq instead of the task rq; this is safe since we have IRQs
disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2367,10 +2367,11 @@ static void update_avg(u64 *avg, u64 sam
 #endif
 
 static void
-ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
 #ifdef CONFIG_SCHEDSTATS
 	int this_cpu = smp_processor_id();
+	struct rq *rq = this_rq();
 
 	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);
@@ -2491,9 +2492,10 @@ try_to_wake_up(struct task_struct *p, un
 	activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 out_running:
 	ttwu_post_activation(p, rq, wake_flags);
-	ttwu_stat(rq, p, cpu, wake_flags);
 	success = 1;
 	__task_rq_unlock(rq);
+
+	ttwu_stat(p, cpu, wake_flags);
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 	put_cpu();
@@ -2527,7 +2529,7 @@ static void try_to_wake_up_local(struct 
 		activate_task(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_post_activation(p, rq, 0);
-	ttwu_stat(rq, p, smp_processor_id(), 0);
+	ttwu_stat(p, smp_processor_id(), 0);
 out:
 	raw_spin_unlock(&p->pi_lock);
 }



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 16/18] sched: Rename ttwu_post_activation
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (14 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 15/18] sched: Remove rq argument from ttwu_stat() Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-29  1:08   ` Frank Rowand
  2011-01-04 14:59 ` [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu Peter Zijlstra
                   ` (3 subsequent siblings)
  19 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu_do_wakeup.patch --]
[-- Type: text/plain, Size: 1367 bytes --]

ttwu_post_activation() does the core wakeup: it sets TASK_RUNNING
and performs wakeup-preemption, so give it a more descriptive name.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2399,8 +2399,11 @@ ttwu_stat(struct task_struct *p, int cpu
 #endif /* CONFIG_SCHEDSTATS */
 }
 
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
 static void
-ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	trace_sched_wakeup(p, true);
 	check_preempt_curr(rq, p, wake_flags);
@@ -2492,7 +2495,7 @@ try_to_wake_up(struct task_struct *p, un
 
 	activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 out_running:
-	ttwu_post_activation(p, rq, wake_flags);
+	ttwu_do_wakeup(rq, p, wake_flags);
 	success = 1;
 	__task_rq_unlock(rq);
 
@@ -2529,7 +2532,7 @@ static void try_to_wake_up_local(struct 
 	if (!p->on_rq)
 		activate_task(rq, p, ENQUEUE_WAKEUP);
 
-	ttwu_post_activation(p, rq, 0);
+	ttwu_do_wakeup(rq, p, 0);
 	ttwu_stat(p, smp_processor_id(), 0);
 out:
 	raw_spin_unlock(&p->pi_lock);



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (15 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 16/18] sched: Rename ttwu_post_activation Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-05 21:07   ` Oleg Nesterov
  2011-01-04 14:59 ` [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing Peter Zijlstra
                   ` (2 subsequent siblings)
  19 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu-queue-remote.patch --]
[-- Type: text/plain, Size: 6383 bytes --]

Now that we've removed the rq->lock requirement from the first part of
ttwu() and can compute placement without holding any rq->lock, ensure
we execute the second half of ttwu() on the actual cpu we want the
task to run on.

This avoids having to take rq->lock and doing the task enqueue
remotely, saving lots on cacheline transfers.

As measured using: http://oss.oracle.com/~mason/sembench.c

$ echo 4096 32000 64 128 > /proc/sys/kernel/sem
$ ./sembench -t 2048 -w 1900 -o 0

unpatched: run time 30 seconds 639476 worker burns per second
patched:   run time 30 seconds 847526 worker burns per second

Still need to sort out all the races marked XXX (non-trivial), and it's
x86 only for the moment.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/kernel/smp.c   |    1 
 include/linux/sched.h   |    2 
 kernel/sched.c          |  139 +++++++++++++++++++++++++++++++++++++-----------
 kernel/sched_features.h |    2 
 4 files changed, 113 insertions(+), 31 deletions(-)

Index: linux-2.6/arch/x86/kernel/smp.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smp.c
+++ linux-2.6/arch/x86/kernel/smp.c
@@ -205,6 +205,7 @@ void smp_reschedule_interrupt(struct pt_
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
+	sched_ttwu_pending();
 }
 
 void smp_call_function_interrupt(struct pt_regs *regs)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1020,6 +1020,7 @@ partition_sched_domains(int ndoms_new, c
 }
 #endif	/* !CONFIG_SMP */
 
+void sched_ttwu_pending(void);
 
 struct io_context;			/* See blkdev.h */
 
@@ -1201,6 +1202,7 @@ struct task_struct {
 	int lock_depth;		/* BKL lock depth */
 
 #ifdef CONFIG_SMP
+	struct task_struct *wake_entry;
 	int on_cpu;
 #endif
 	int on_rq;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -559,6 +559,10 @@ struct rq {
 	/* BKL stats */
 	unsigned int bkl_count;
 #endif
+
+#ifdef CONFIG_SMP
+	struct task_struct *wake_list;
+#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -2431,6 +2435,96 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	WARN_ON_ONCE(task_cpu(p) != cpu_of(rq));
+
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible--;
+
+	activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (p->on_rq) {
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+}
+
+void sched_ttwu_pending(void)
+{
+#ifdef CONFIG_SMP
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	raw_spin_lock(&rq->lock);
+
+	while (list) {
+		struct task_struct *p = list;
+		list = list->wake_entry;
+		ttwu_do_activate(rq, p, 0);
+	}
+
+	raw_spin_unlock(&rq->lock);
+#endif
+}
+
+#ifdef CONFIG_SMP
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+	struct task_struct *next = NULL;
+	struct rq *rq = cpu_rq(cpu);
+
+	for (;;) {
+		struct task_struct *old = next;
+
+		p->wake_entry = next;
+		next = cmpxchg(&rq->wake_list, old, p);
+		if (next == old)
+			break;
+	}
+
+	if (!next)
+		smp_send_reschedule(cpu);
+}
+#endif
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+#ifdef CONFIG_SMP
+	if (!sched_feat(TTWU_FORCE_REMOTE) && cpu != smp_processor_id()) {
+		ttwu_queue_remote(p, cpu);
+		return;
+	}
+#endif
+
+	raw_spin_lock(&rq->lock);
+	ttwu_do_activate(rq, p, 0);
+	raw_spin_unlock(&rq->lock);
+}
+
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
@@ -2449,60 +2543,43 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-	int cpu, this_cpu, success = 0;
 	unsigned long flags;
-	struct rq *rq;
-
-	this_cpu = get_cpu();
+	int cpu, success = 0;
 
 	smp_wmb();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	if (!(p->state & state))
 		goto out;
 
+	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
-	if (p->on_rq) {
-		rq = __task_rq_lock(p);
-		if (p->on_rq)
-			goto out_running;
-		__task_rq_unlock(rq);
-	}
+	if (p->on_rq && ttwu_remote(p, wake_flags))
+		goto stat;
+
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
 
 #ifdef CONFIG_SMP
+	/*
+	 * If the owning (remote) cpu is still in the middle of schedule() with
+	 * this task as prev, wait until its done referencing the task.
+	 */
 	while (p->on_cpu)
 		cpu_relax();
 
-	p->sched_contributes_to_load = !!task_contributes_to_load(p);
-	p->state = TASK_WAKING;
-
 	if (p->sched_class->task_waking)
 		p->sched_class->task_waking(p);
 
 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	set_task_cpu(p, cpu);
 #endif /* CONFIG_SMP */
 
-	rq = cpu_rq(cpu);
-	raw_spin_lock(&rq->lock);
-
-#ifdef CONFIG_SMP
-	if (cpu != task_cpu(p))
-		set_task_cpu(p, cpu);
-
-	if (p->sched_contributes_to_load)
-		rq->nr_uninterruptible--;
-#endif
-
-	activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-out_running:
-	ttwu_do_wakeup(rq, p, wake_flags);
-	success = 1;
-	__task_rq_unlock(rq);
-
+	ttwu_queue(p, cpu);
+stat:
 	ttwu_stat(p, cpu, wake_flags);
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-	put_cpu();
 
 	return success;
 }
Index: linux-2.6/kernel/sched_features.h
===================================================================
--- linux-2.6.orig/kernel/sched_features.h
+++ linux-2.6/kernel/sched_features.h
@@ -64,3 +64,5 @@ SCHED_FEAT(OWNER_SPIN, 1)
  * Decrement CPU power based on irq activity
  */
 SCHED_FEAT(NONIRQ_POWER, 1)
+
+SCHED_FEAT(TTWU_FORCE_REMOTE, 0)



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (16 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu Peter Zijlstra
@ 2011-01-04 14:59 ` Peter Zijlstra
  2011-01-05 20:47   ` Oleg Nesterov
  2011-01-04 15:16 ` [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Ingo Molnar
  2011-01-29  1:20 ` Frank Rowand
  19 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:59 UTC (permalink / raw)
  To: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	Yong Zhang
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-ttwu-hotplug.patch --]
[-- Type: text/plain, Size: 1315 bytes --]

On hot-unplug, flush the pending wakeup queue by selecting a new rq for
each queued task and requeueing it appropriately.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2526,6 +2526,29 @@ static void ttwu_queue(struct task_struc
 	raw_spin_unlock(&rq->lock);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static void ttwu_queue_unplug(struct rq *rq)
+{
+	struct task_struct *p, *list = xchg(&rq->wake_list, NULL);
+	unsigned long flags;
+	int cpu;
+
+	if (!list)
+		return;
+
+	while (list) {
+		p = list;
+		list = list->wake_entry;
+
+		raw_spin_lock_irqsave(&p->pi_lock, flags);
+		cpu = select_task_rq(p, SD_BALANCE_WAKE, 0);
+		set_task_cpu(p, cpu);
+		ttwu_queue(p, cpu);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	}
+}
+#endif
+
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
@@ -6151,6 +6174,11 @@ migration_call(struct notifier_block *nf
 		migrate_nr_uninterruptible(rq);
 		calc_global_load_remove(rq);
 		break;
+
+	case CPU_DEAD:
+		ttwu_queue_unplug(cpu_rq(cpu));
+		break;
+
 #endif
 	}
 	return NOTIFY_OK;



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (17 preceding siblings ...)
  2011-01-04 14:59 ` [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing Peter Zijlstra
@ 2011-01-04 15:16 ` Ingo Molnar
  2011-01-29  1:20 ` Frank Rowand
  19 siblings, 0 replies; 44+ messages in thread
From: Ingo Molnar @ 2011-01-04 15:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Latest version of the scary patches...

Just to make it easier for others to see the rationale behind this series, see the 
numbers and limitations listed below.

	Ingo

------------------------->
As measured using: http://oss.oracle.com/~mason/sembench.c

$ echo 4096 32000 64 128 > /proc/sys/kernel/sem
$ ./sembench -t 2048 -w 1900 -o 0

unpatched: run time 30 seconds 639476 worker burns per second
patched:   run time 30 seconds 847526 worker burns per second

Still need to sort out all the races marked XXX (non-trivial), and it's
x86 only for the moment.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 05/18] sched: Provide p->on_rq
  2011-01-04 14:59 ` [RFC][PATCH 05/18] sched: Provide p->on_rq Peter Zijlstra
@ 2011-01-05  8:13   ` Yong Zhang
  2011-01-05  9:53     ` Peter Zijlstra
  2011-01-29  0:10   ` Frank Rowand
  1 sibling, 1 reply; 44+ messages in thread
From: Yong Zhang @ 2011-01-05  8:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	linux-kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=UTF-8, Size: 4270 bytes --]

On Tue, Jan 4, 2011 at 10:59 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Provide a generic p->on_rq because the p->se.on_rq semantics are
> unfavourable for lockless wakeups but needed for sched_fair.
>
> In particular, p->on_rq is only cleared when we actually dequeue the
> task in schedule() and not on any random dequeue as done by things
> like __migrate_task() and __sched_setscheduler().
>
> This also allows us to remove p->se usage from !sched_fair code.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/sched.h   |    1 +
>  kernel/sched.c          |   36 ++++++++++++++++++------------------
>  kernel/sched_debug.c    |    2 +-
>  kernel/sched_rt.c       |   10 +++++-----
>  kernel/sched_stoptask.c |    2 +-
>  5 files changed, 26 insertions(+), 25 deletions(-)
>
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -1132,7 +1132,7 @@ static void put_prev_task_rt(struct rq *
>         * The previous task needs to be made eligible for pushing
>         * if it is still active
>         */
> -       if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
> +       if (p->on_rq && p->rt.nr_cpus_allowed > 1)

How about on_rt_rq(&p->rt) here?

Quoted from my previous reply:
[Seems we need on_rt_rq(&p->rt) here, otherwise we enqueue the
task to pushable list when called from rt_mutex_setprio()/
__sched_setscheduler() etc. Thus add a little overhead.
Though we call dequeue_pushable_task() in set_curr_task_rt()
unconditionally.]
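
IOW, something like this (untested sketch against this series; assumes the
existing on_rt_rq() helper in sched_rt.c):

	/*
	 * Only requeue @p on the pushable list if it is still enqueued
	 * on the rt_rq; with this series p->on_rq stays set across the
	 * temporary dequeue done by rt_mutex_setprio() and
	 * __sched_setscheduler().
	 */
	if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);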

Thanks,
Yong

>                enqueue_pushable_task(rq, p);
>  }
>
> @@ -1283,7 +1283,7 @@ static struct rq *find_lock_lowest_rq(st
>                                     !cpumask_test_cpu(lowest_rq->cpu,
>                                                       &task->cpus_allowed) ||
>                                     task_running(rq, task) ||
> -                                    !task->se.on_rq)) {
> +                                    !task->on_rq)) {
>
>                                raw_spin_unlock(&lowest_rq->lock);
>                                lowest_rq = NULL;
> @@ -1317,7 +1317,7 @@ static struct task_struct *pick_next_pus
>        BUG_ON(task_current(rq, p));
>        BUG_ON(p->rt.nr_cpus_allowed <= 1);
>
> -       BUG_ON(!p->se.on_rq);
> +       BUG_ON(!p->on_rq);
>        BUG_ON(!rt_task(p));
>
>        return p;
> @@ -1463,7 +1463,7 @@ static int pull_rt_task(struct rq *this_
>                 */
>                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
>                        WARN_ON(p == src_rq->curr);
> -                       WARN_ON(!p->se.on_rq);
> +                       WARN_ON(!p->on_rq);
>
>                        /*
>                         * There's a chance that p is higher in priority
> @@ -1534,7 +1534,7 @@ static void set_cpus_allowed_rt(struct t
>         * Update the migration status of the RQ if we have an RT task
>         * which is running AND changing its weight value.
>         */
> -       if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
> +       if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
>                struct rq *rq = task_rq(p);
>
>                if (!task_current(rq, p)) {
> Index: linux-2.6/kernel/sched_stoptask.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_stoptask.c
> +++ linux-2.6/kernel/sched_stoptask.c
> @@ -26,7 +26,7 @@ static struct task_struct *pick_next_tas
>  {
>        struct task_struct *stop = rq->stop;
>
> -       if (stop && stop->se.on_rq)
> +       if (stop && stop->on_rq)
>                return stop;
>
>        return NULL;
>
>
>



-- 
Only stand for myself

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 05/18] sched: Provide p->on_rq
  2011-01-05  8:13   ` Yong Zhang
@ 2011-01-05  9:53     ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-05  9:53 UTC (permalink / raw)
  To: Yong Zhang
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Oleg Nesterov, Paul Turner, Jens Axboe,
	linux-kernel, Steven Rostedt

On Wed, 2011-01-05 at 16:13 +0800, Yong Zhang wrote:
> > +       if (p->on_rq && p->rt.nr_cpus_allowed > 1)
> 
> How about on_rt_rq(&p->rt) here?
> 
> Quoted from my previous reply:
> [Seems we need on_rt_rq(&p->rt) here, otherwise we enqueue the
> task to pushable list when called from rt_mutex_setprio()/
> __sched_setscheduler() etc. Thus add a little overhead.
> Though we call dequeue_pushable_task() in set_curr_task_rt()
> unconditionally.] 

Ah, sorry for losing that reply..

Yes I think that would work, Steven?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
  2011-01-04 14:59 ` [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock() Peter Zijlstra
@ 2011-01-05 18:46   ` Oleg Nesterov
  2011-01-05 19:33     ` Peter Zijlstra
  2011-01-29  0:21   ` Frank Rowand
  1 sibling, 1 reply; 44+ messages in thread
From: Oleg Nesterov @ 2011-01-05 18:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On 01/04, Peter Zijlstra wrote:
>
> This makes task_rq_lock() acquire both locks, and have
> __task_rq_lock() validate that p->pi_lock is held.

... and kills task_is_waking(), good ;)

So TASK_WAKING only means "do not try to wake up"; this greatly
simplifies things.

One purely cosmetic nit,

> @@ -4902,8 +4898,7 @@ static int __sched_setscheduler(struct t
>
>  		check_class_changed(rq, p, prev_class, oldprio, running);
>  	}
> -	__task_rq_unlock(rq);
> -	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +	task_rq_unlock(rq, p, &flags);

__sched_setscheduler() has a couple more instances of

	__task_rq_unlock(rq);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
	return EXXX;

above.

> @@ -5691,8 +5685,7 @@ int set_cpus_allowed_ptr(struct task_str
>  		return 0;
>  	}
>  out:
> -	__task_rq_unlock(rq);
> -	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +	task_rq_unlock(rq, p, &flags);

the same.


Hmm, and normalize_rt_tasks() could just do task_rq_lock/task_rq_unlock.
And why does it do read_lock_irqsave(tasklist), btw? _irqsave looks unneeded.
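
Something like this, perhaps (untested, and assuming the normalize_task()
helper stays as it is):

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		/* skip kernel threads / non-rt tasks as before */

		rq = task_rq_lock(p, &flags);
		normalize_task(rq, p);
		task_rq_unlock(rq, p, &flags);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);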

Oleg.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
  2011-01-05 18:46   ` Oleg Nesterov
@ 2011-01-05 19:33     ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-05 19:33 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On Wed, 2011-01-05 at 19:46 +0100, Oleg Nesterov wrote:

> One purely cosmetic nit,

Fixed those, thanks!

> Hmm, and normalize_rt_tasks(), it could just do task_rq_lock/task_rq_unlock.
> And why it does read_lock_irqsave(tasklist), btw? _irqsave looks unneeded. 

I'm not quite sure; I think we can make that an RCU iteration of the
taskset as well, with no need to even hold the task-lock. It would render
the sysrq button useless in the face of an RT fork bomb, but I'm not sure
we really care about that at all.

Ingo, Thomas?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing
  2011-01-04 14:59 ` [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing Peter Zijlstra
@ 2011-01-05 20:47   ` Oleg Nesterov
  2011-01-06 10:56     ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Oleg Nesterov @ 2011-01-05 20:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On 01/04, Peter Zijlstra wrote:
>
> +#ifdef CONFIG_HOTPLUG_CPU
> +static void ttwu_queue_unplug(struct rq *rq)
> +{
> +	struct task_struct *p, *list = xchg(&rq->wake_list, NULL);
> +	unsigned long flags;
> +	int cpu;
> +
> +	if (!list)
> +		return;
> +
> +	while (list) {
> +		p = list;
> +		list = list->wake_entry;
> +
> +		raw_spin_lock_irqsave(&p->pi_lock, flags);
> +		cpu = select_task_rq(p, SD_BALANCE_WAKE, 0);
> +		set_task_cpu(p, cpu);
> +		ttwu_queue(p, cpu);
> +		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +	}
> +}
> +#endif
> +
>  /**
>   * try_to_wake_up - wake up a thread
>   * @p: the thread to be awakened
> @@ -6151,6 +6174,11 @@ migration_call(struct notifier_block *nf
>  		migrate_nr_uninterruptible(rq);
>  		calc_global_load_remove(rq);
>  		break;
> +
> +	case CPU_DEAD:
> +		ttwu_queue_unplug(cpu_rq(cpu));

I think this is not strictly needed...

Afaics, take_cpu_down() can simply call sched_ttwu_pending() at the
start. This will activate the pending tasks on the (almost) dead
cpu but we don't care, they will be migrated later.

When __stop_machine(take_cpu_down) returns nobody can use this CPU
as a target, iow rq->wake_list can't be used again.

Oleg.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-04 14:59 ` [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu Peter Zijlstra
@ 2011-01-05 21:07   ` Oleg Nesterov
  2011-01-06 15:09     ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Oleg Nesterov @ 2011-01-05 21:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On 01/04, Peter Zijlstra wrote:
>
> Now that we've removed the rq->lock requirement from the first part of
> ttwu() and can compute placement without holding any rq->lock, ensure
> we execute the second half of ttwu() on the actual cpu we want the
> task to run on.

Damn. I am reading this patch back and forth, many times, and
I am not able to find any problem. So sad!

I'll try to read it once again with the fresh head, though ;)
I also have a couple of very minor nits... In particular, perhaps
TASK_WAKING can die...

Just one question for today,

>  try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  {
> -	int cpu, this_cpu, success = 0;
>  	unsigned long flags;
> -	struct rq *rq;
> -
> -	this_cpu = get_cpu();
> +	int cpu, success = 0;
>
>  	smp_wmb();
>  	raw_spin_lock_irqsave(&p->pi_lock, flags);
>  	if (!(p->state & state))
>  		goto out;
>
> +	success = 1; /* we're going to change ->state */
>  	cpu = task_cpu(p);
>
> -	if (p->on_rq) {
> -		rq = __task_rq_lock(p);
> -		if (p->on_rq)
> -			goto out_running;
> -		__task_rq_unlock(rq);
> -	}
> +	if (p->on_rq && ttwu_remote(p, wake_flags))
> +		goto stat;
> +
> +	p->sched_contributes_to_load = !!task_contributes_to_load(p);
> +	p->state = TASK_WAKING;
>
>  #ifdef CONFIG_SMP
> +	/*
> +	 * If the owning (remote) cpu is still in the middle of schedule() with
> +	 * this task as prev, wait until its done referencing the task.
> +	 */
>  	while (p->on_cpu)
>  		cpu_relax();

Don't we need rmb() after that?

No, I am not saying it _is_ needed. I am asking.
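
IOW, just to illustrate the question (not a patch):

	while (p->on_cpu)
		cpu_relax();
	/*
	 * Without a barrier here the reads that follow could be
	 * reordered before we actually observe on_cpu == 0.
	 */
	smp_rmb();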

(but need_migrate_task() can avoid on_cpu+rmb afaics)

Oleg.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing
  2011-01-05 20:47   ` Oleg Nesterov
@ 2011-01-06 10:56     ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-06 10:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On Wed, 2011-01-05 at 21:47 +0100, Oleg Nesterov wrote:
> On 01/04, Peter Zijlstra wrote:
> >
> > +#ifdef CONFIG_HOTPLUG_CPU
> > +static void ttwu_queue_unplug(struct rq *rq)
> > +{
> > +	struct task_struct *p, *list = xchg(&rq->wake_list, NULL);
> > +	unsigned long flags;
> > +	int cpu;
> > +
> > +	if (!list)
> > +		return;
> > +
> > +	while (list) {
> > +		p = list;
> > +		list = list->wake_entry;
> > +
> > +		raw_spin_lock_irqsave(&p->pi_lock, flags);
> > +		cpu = select_task_rq(p, SD_BALANCE_WAKE, 0);
> > +		set_task_cpu(p, cpu);
> > +		ttwu_queue(p, cpu);
> > +		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> > +	}
> > +}
> > +#endif
> > +
> >  /**
> >   * try_to_wake_up - wake up a thread
> >   * @p: the thread to be awakened
> > @@ -6151,6 +6174,11 @@ migration_call(struct notifier_block *nf
> >  		migrate_nr_uninterruptible(rq);
> >  		calc_global_load_remove(rq);
> >  		break;
> > +
> > +	case CPU_DEAD:
> > +		ttwu_queue_unplug(cpu_rq(cpu));
> 
> I think this is not strictly needed...
> 
> Afaics, take_cpu_down() can simply call sched_ttwu_pending() at the
> start. This will activate the pending tasks on the (almost) dead
> cpu but we don't care, they will be migrated later.

At the end, __cpu_disable() is the one clearing all the cpu mask bits.
So we could call it from CPU_DYING I guess.

> When __stop_machine(take_cpu_down) returns nobody can use this CPU
> as a target, iow rq->wake_list can't be used again.

Right, so something like the below should suffice; I've folded that into
patch 17.

---
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6129,6 +6129,7 @@ migration_call(struct notifier_block *nf
 
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DYING:
+		sched_ttwu_pending();
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq()
  2011-01-04 14:59 ` [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq() Peter Zijlstra
@ 2011-01-06 13:57   ` Peter Zijlstra
  2011-01-06 14:23     ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-06 13:57 UTC (permalink / raw)
  To: Chris Mason
  Cc: Frank Rowand, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel,
	Steven Rostedt

On Tue, 2011-01-04 at 15:59 +0100, Peter Zijlstra wrote:
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -973,11 +973,18 @@ static void yield_task_rt(struct rq *rq)
>  static int find_lowest_rq(struct task_struct *task);
>  
>  static int
> -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
> +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
>  {
>         if (sd_flag != SD_BALANCE_WAKE)
>                 return smp_processor_id();
>  
> +#if 0
> +       /*
> +        * XXX without holding rq->lock the below is racy, need to
> +        * rewrite it in a racy but non-dangerous way so that we mostly
> +        * get the benefit of the heuristic but don't crash the kernel
> +        * if we get it wrong ;-)
> +        */
>         /*
>          * If the current task is an RT task, then
>          * try to see if we can wake this RT task up on another
> @@ -1002,6 +1009,7 @@ select_task_rq_rt(struct rq *rq, struct 
>  
>                 return (cpu == -1) ? task_cpu(p) : cpu;
>         }
> +#endif
>  
>         /*
>          * Otherwise, just let it ride on the affined RQ and the 


How about something like so?

---
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -975,18 +975,21 @@ static int find_lowest_rq(struct task_st
 static int
 select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 {
+	struct task_struct *curr;
+	struct rq *rq;
+	int cpu;
+
 	if (sd_flag != SD_BALANCE_WAKE)
 		return smp_processor_id();
 
-#if 0
-	/*
-	 * XXX without holding rq->lock the below is racy, need to
-	 * rewrite it in a racy but non-dangerous way so that we mostly
-	 * get the benefit of the heuristic but don't crash the kernel
-	 * if we get it wrong ;-)
-	 */
+	cpu = task_cpu(p);
+	rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	curr = rcu_dereference(rq->curr); /* unlocked access */
+
 	/*
-	 * If the current task is an RT task, then
+	 * If the current task on @p's runqueue is an RT task, then
 	 * try to see if we can wake this RT task up on another
 	 * runqueue. Otherwise simply start this RT task
 	 * on its current runqueue.
@@ -1000,22 +1003,25 @@ select_task_rq_rt(struct task_struct *p,
 	 * lock?
 	 *
 	 * For equal prio tasks, we just let the scheduler sort it out.
+	 *
+	 * Otherwise, just let it ride on the affined RQ and the
+	 * post-schedule router will push the preempted task away
+	 *
+	 * This test is optimistic, if we get it wrong the load-balancer
+	 * will have to sort it out.
 	 */
-	if (unlikely(rt_task(rq->curr)) &&
-	    (rq->curr->rt.nr_cpus_allowed < 2 ||
-	     rq->curr->prio < p->prio) &&
+	if (curr && unlikely(rt_task(curr)) &&
+	    (curr->rt.nr_cpus_allowed < 2 ||
+	     curr->prio < p->prio) &&
 	    (p->rt.nr_cpus_allowed > 1)) {
-		int cpu = find_lowest_rq(p);
+		int target = find_lowest_rq(p);
 
-		return (cpu == -1) ? task_cpu(p) : cpu;
+		if (target != -1)
+			cpu = target;
 	}
-#endif
+	rcu_read_unlock();
 
-	/*
-	 * Otherwise, just let it ride on the affined RQ and the
-	 * post-schedule router will push the preempted task away
-	 */
-	return task_cpu(p);
+	return cpu;
 }
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq()
  2011-01-06 13:57   ` Peter Zijlstra
@ 2011-01-06 14:23     ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-06 14:23 UTC (permalink / raw)
  To: Chris Mason
  Cc: Frank Rowand, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel,
	Steven Rostedt

On Thu, 2011-01-06 at 14:57 +0100, Peter Zijlstra wrote:
> +       curr = rcu_dereference(rq->curr); /* unlocked access */

Hrm, I got my barriers wrong again, I think we can go with ACCESS_ONCE()
here.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-05 21:07   ` Oleg Nesterov
@ 2011-01-06 15:09     ` Peter Zijlstra
  2011-01-07 15:22       ` Oleg Nesterov
  0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-06 15:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On Wed, 2011-01-05 at 22:07 +0100, Oleg Nesterov wrote:
> On 01/04, Peter Zijlstra wrote:
> >
> > Now that we've removed the rq->lock requirement from the first part of
> > ttwu() and can compute placement without holding any rq->lock, ensure
> > we execute the second half of ttwu() on the actual cpu we want the
> > task to run on.
> 
> Damn. I am reading this patch back and forth, many times, and
> I am not able to find any problem. So sad!

:-)

> I'll try to read it once again with the fresh head, though ;)
> I also have a couple of very minor nits... In particular, perhaps
> TASK_WAKING can die...

I think it might.. I'll do a patch at the end removing it, let's see what
happens.

> Just one question for today,
> 
> >  try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> >  {
> > -	int cpu, this_cpu, success = 0;
> >  	unsigned long flags;
> > -	struct rq *rq;
> > -
> > -	this_cpu = get_cpu();
> > +	int cpu, success = 0;
> >
> >  	smp_wmb();
> >  	raw_spin_lock_irqsave(&p->pi_lock, flags);
> >  	if (!(p->state & state))
> >  		goto out;
> >
> > +	success = 1; /* we're going to change ->state */
> >  	cpu = task_cpu(p);
> >
> > -	if (p->on_rq) {
> > -		rq = __task_rq_lock(p);
> > -		if (p->on_rq)
> > -			goto out_running;
> > -		__task_rq_unlock(rq);
> > -	}
> > +	if (p->on_rq && ttwu_remote(p, wake_flags))
> > +		goto stat;
> > +
> > +	p->sched_contributes_to_load = !!task_contributes_to_load(p);
> > +	p->state = TASK_WAKING;
> >
> >  #ifdef CONFIG_SMP
> > +	/*
> > +	 * If the owning (remote) cpu is still in the middle of schedule() with
> > +	 * this task as prev, wait until its done referencing the task.
> > +	 */
> >  	while (p->on_cpu)
> >  		cpu_relax();
> 
> Don't we need rmb() after that?
> 
> No, I am not saying it _is_ needed. I am asking.

OK, so I've been thinking and all I can say is that I've got a head-ache
and that I _think_ you're right.. while it doesn't matter for the
observation of p->on_cpu itself, we don't want other reads to be done
before we do indeed observe it.

> (but need_migrate_task() can avoid on_cpu+rmb afaics)

I think so too, I'll stick a patch to that effect at the end.

Current queue lives at:
  http://programming.kicks-ass.net/sekrit/patches-ttwu.tar.bz2



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu()
  2011-01-04 14:59 ` [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
@ 2011-01-06 16:29   ` Peter Zijlstra
  2011-01-29  1:05   ` Frank Rowand
  1 sibling, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-06 16:29 UTC (permalink / raw)
  To: Chris Mason
  Cc: Frank Rowand, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On Tue, 2011-01-04 at 15:59 +0100, Peter Zijlstra wrote:
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -1343,8 +1343,7 @@ static void task_waking_fair(struct task
>         struct sched_entity *se = &p->se;
>         struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
> -       lockdep_assert_held(&task_rq(p)->lock);
> -
> +       // XXX racy on 32bit
>         se->vruntime -= cfs_rq->min_vruntime;
>  } 


Best I can come up with is something like this:


Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -315,6 +315,9 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
+#endif
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -365,6 +365,10 @@ static void update_min_vruntime(struct c
 	}
 
 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+	smp_wmb();
+	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
 }
 
 /*
@@ -1342,9 +1346,21 @@ static void task_waking_fair(struct task
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 min_vruntime;
 
-	// XXX racy on 32bit
-	se->vruntime -= cfs_rq->min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
+
+	do {
+		min_vruntime_copy = cfs_rq->min_vruntime_copy;
+		smp_rmb();
+		min_vruntime = cfs_rq->min_vruntime;
+	} while (min_vruntime != min_vruntime_copy);
+#else
+	min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+	se->vruntime -= min_vruntime;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-06 15:09     ` Peter Zijlstra
@ 2011-01-07 15:22       ` Oleg Nesterov
  2011-01-18 16:38         ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Oleg Nesterov @ 2011-01-07 15:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On 01/06, Peter Zijlstra wrote:
>
> On Wed, 2011-01-05 at 22:07 +0100, Oleg Nesterov wrote:
>
> > I'll try to read it once again with the fresh head, though ;)
> > I also have a couple of very minor nits... In particular, perhaps
> > TASK_WAKING can die...
>
> I think it might.. I'll do a patch at the end removing it, lets see what
> happens.

Yes, ttwu can just set TASK_RUNNING. But, otoh, perhaps the
special state makes sense anyway, say, it can help to debug
the problems. We can even have TASK_WAKING_CONTRIBUTES_TO_LOAD
instead of ->sched_contributes_to_load. But this all is very
minor.

A couple of questions...


Why sched_fork() does set_task_cpu() ? Just curious, it seems
that wake_up_new_task() does all we need.


ttwu_queue_remote() does "struct task_struct *next = NULL".
Probably "next = rq->wake_list" makes more sense. Otherwise the
first cmpxchg() always fails if rq->wake_list != NULL.
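
IOW (untested, using the field names from this patch):

	static void ttwu_queue_remote(struct task_struct *p, int cpu)
	{
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *next = rq->wake_list;

		for (;;) {
			struct task_struct *old = next;

			p->wake_entry = next;
			next = cmpxchg(&rq->wake_list, old, p);
			if (next == old)
				break;
		}

		/* only the queuer that found an empty list sends the IPI */
		if (!next)
			smp_send_reschedule(cpu);
	}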



Doesn't __migrate_task() need pi_lock? Consider:

1. A task T runs on CPU_0, it does set_current_state(TASK_INTERRUBTIBLE)

2. some CPU does set_cpus_allowed_ptr(T, new_mask), new_mask doesn't
   include CPU_0.

   T is running, cpumask_any_and() picks CPU_1, set_cpus_allowed_ptr()
   drops pi_lock and rq->lock before stop_one_cpu().

3. T calls schedule() and becomes deactivated.

4. CPU_2 does try_to_wake_up(T, TASK_INTERRUPTIBLE), takes pi_lock
   and sees on_rq == F.

5. set_cpus_allowed_ptr() resumes and calls stop_one_cpu(cpu => 1).

6. cpu_stopper_thread() runs on CPU_1 and calls ____migrate_task().
   It locks CPU_0 and CPU_1 rq's and checks task_cpu() == src_cpu.

7. CPU_2 calls select_task_rq(), it returns (to simplify) 2.

   Now try_to_wake_up() does set_task_cpu(T, 2), and calls
   ttwu_queue()->ttwu_do_activate()->activate_task().

8. __migrate_task() on CPU_1 sees p->on_rq and starts the
   deactivate/activate dance racing with ttwu_do_activate()
   on CPU_2.



And a final question. This is really, really minor, but
activate_task/deactivate_task are not symmetric: the former
always sets p->on_rq. Looks correct, but imho a bit confusing and
can complicate the understanding. Since p->on_rq is cleared
explicitly by schedule(), perhaps it can be set explicitly
in try_to_wake_up_*(). Or, perhaps, activate/deactivate can check
ENQUEUE_WAKEUP/DEQUEUE_SLEEP and set/clear p->on_rq. Once again,
this is a purely cosmetic issue.
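
The second variant would just be something like (untested):

	/* at the end of activate_task(): */
	if (flags & ENQUEUE_WAKEUP)
		p->on_rq = 1;

	/* at the end of deactivate_task(): */
	if (flags & DEQUEUE_SLEEP)
		p->on_rq = 0;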

Oleg.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-07 15:22       ` Oleg Nesterov
@ 2011-01-18 16:38         ` Peter Zijlstra
  2011-01-19 19:37           ` Oleg Nesterov
  2011-01-29  0:04           ` Frank Rowand
  0 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-01-18 16:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On Fri, 2011-01-07 at 16:22 +0100, Oleg Nesterov wrote:

> Why sched_fork() does set_task_cpu() ? Just curious, it seems
> that wake_up_new_task() does all we need.

The only reason I can come up with is to properly initialize the
data-structures before making the thing visible; by the time
wake_up_new_task() comes along, it's already fully visible.

> ttwu_queue_remote() does "struct task_struct *next = NULL".
> Probably "next = rq->wake_list" makes more sens. Otherwise the
> first cmpxchg() always fails if rq->wake_list != NULL.

Indeed, I think Yong mentioned the same a while back.. done.

> Doesn't __migrate_task() need pi_lock? Consider:
> 
> 1. A task T runs on CPU_0, it does set_current_state(TASK_INTERRUBTIBLE)
> 
> 2. some CPU does set_cpus_allowed_ptr(T, new_mask), new_mask doesn't
>    include CPU_0.
> 
>    T is running, cpumask_any_and() picks CPU_1, set_cpus_allowed_ptr()
>    drops pi_lock and rq->lock before stop_one_cpu().
> 
> 3. T calls schedule() and becomes deactivated.
> 
> 4. CPU_2 does try_to_wake_up(T, TASK_INTERRUPTIBLE), takes pi_lock
>    and sees on_rq == F.
>
> 5. set_cpus_allowed_ptr() resumes and calls stop_one_cpu(cpu => 1).
> 
> 6. cpu_stopper_thread() runs on CPU_1 and calls ____migrate_task().
>    It locks CPU_0 and CPU_1 rq's and checks task_cpu() == src_cpu.
> 
> 7. CPU_2 calls select_task_rq(), it returns (to simplify) 2.
> 
>    Now try_to_wake_up() does set_task_cpu(T, 2), and calls
>    ttwu_queue()->ttwu_do_activate()->activate_task().
> 
> 8. __migrate_task() on CPU_1 sees p->on_rq and starts the
>    deactivate/activate dance racing with ttwu_do_activate()
>    on CPU_2.

Drat, yes I think you're right, now you've got me worried about the
other migration paths too.. however did you come up with that
scenario? :-)

A simple fix would be to keep ->pi_lock locked over the call to
stop_one_cpu() from set_cpus_allowed_ptr().

I think the sched_fair.c load-balance code paths are ok because we only
find a task to migrate after we've obtained both runqueue locks, so even
if we migrate current, it cannot schedule (step 3).

I'm not at all sure about the sched_rt load-balance paths, will need to
twist my head around that..


> And a final question. This is really, really minor, but
> activate_task/deactivate_task are not symmetric: the former
> always sets p->on_rq. Looks correct, but imho a bit confusing and
> it can complicate the understanding. Since p->on_rq is cleared
> explicitly by schedule(), perhaps it can be set explicitly to 1
> in try_to_wake_up_*. Or, perhaps, activate/deactivate can check
> ENQUEUE_WAKEUP/DEQUEUE_SLEEP and set/clear p->on_rq. Once again,
> this is a purely cosmetic issue.

Right, only because I didn't want to add conditionals; there are two
ENQUEUE_WAKEUP sites and I didn't want to replicate the assignment. I'll
fix it up.
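
For concreteness, the flag-based variant could look roughly like the
sketch below; this is illustrative only, not the actual change:

	static void activate_task(struct rq *rq, struct task_struct *p, int flags)
	{
		enqueue_task(rq, p, flags);
		if (flags & ENQUEUE_WAKEUP)
			p->on_rq = 1;	/* set only on a real wakeup */
	}

	static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
	{
		dequeue_task(rq, p, flags);
		if (flags & DEQUEUE_SLEEP)
			p->on_rq = 0;	/* cleared only on a real sleep */
	}

That keeps the two helpers symmetric and moves the p->on_rq handling
out of the callers.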



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-18 16:38         ` Peter Zijlstra
@ 2011-01-19 19:37           ` Oleg Nesterov
  2011-01-29  0:04           ` Frank Rowand
  1 sibling, 0 replies; 44+ messages in thread
From: Oleg Nesterov @ 2011-01-19 19:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Frank Rowand, Ingo Molnar, Thomas Gleixner,
	Mike Galbraith, Paul Turner, Jens Axboe, Yong Zhang,
	linux-kernel

On 01/18, Peter Zijlstra wrote:
>
> On Fri, 2011-01-07 at 16:22 +0100, Oleg Nesterov wrote:
>
> > Why does sched_fork() call set_task_cpu()? Just curious, it seems
> > that wake_up_new_task() does all we need.
>
> The only reason I can come up with is to properly initialize the
> data structures before making the thing visible; by the time
> wake_up_new_task() comes along, it's already fully visible.

Ah, thanks, this makes sense.

> > Doesn't __migrate_task() need pi_lock? Consider:
> >
> > ...
>
> Drat, yes I think you're right, now you've got me worried about the
> other migration paths too.. however did you come up with that
> scenario? :-)

I believe this is the only problem with migration... I mean, I failed
to find anything else ;)

> A simple fix would be to keep ->pi_lock locked over the call to
> stop_one_cpu() from set_cpus_allowed_ptr().

I don't think this can work. Suppose that the target CPU is spinning,
with interrupts disabled, waiting for this ->pi_lock: then the stopper
thread can never run there and stop_one_cpu() never returns.

> I think the sched_fair.c load-balance code paths are ok because we only
> find a task to migrate after we've obtained both runqueue locks, so even
> if we migrate current, it cannot schedule (step 3).
>
> I'm not at all sure about the sched_rt load-balance paths, will need to
> twist my head around that..

I _think_ that sched_fair/rt are fine. At least, at first glance.

The new rules are simple, afaics: set_task_cpu() etc. is protected by
rq->lock if the task was not deactivated; otherwise you need
task->pi_lock. That is why I think things like pull_task() are fine,
they work with active tasks and thus only need the src_rq->lock.

And that is why the set_cpus_allowed_ptr()->__migrate_task() "call"
looks wrong in general. The first check in need_migrate_task() is
fine, but then we drop rq->lock. By the time __migrate_task() takes
this lock again we can't trust it, and thus we can't trust
"if (p->on_rq)".
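
A hypothetical lockdep-style assertion that captures this rule, purely
for illustration (the helper name is made up):

	static inline void assert_migration_locking(struct task_struct *p)
	{
		if (p->on_rq) {
			/* still activated: pinned by its runqueue lock */
			lockdep_assert_held(&task_rq(p)->lock);
		} else {
			/* may have been deactivated: only ->pi_lock serializes */
			lockdep_assert_held(&p->pi_lock);
		}
	}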

Oleg.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-18 16:38         ` Peter Zijlstra
  2011-01-19 19:37           ` Oleg Nesterov
@ 2011-01-29  0:04           ` Frank Rowand
  2011-02-03 17:16             ` Peter Zijlstra
  1 sibling, 1 reply; 44+ messages in thread
From: Frank Rowand @ 2011-01-29  0:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Chris Mason, Rowand, Frank, Ingo Molnar,
	Thomas Gleixner, Mike Galbraith, Paul Turner, Jens Axboe,
	Yong Zhang, linux-kernel

On 01/18/11 08:38, Peter Zijlstra wrote:
> On Fri, 2011-01-07 at 16:22 +0100, Oleg Nesterov wrote:

>> Doesn't __migrate_task() need pi_lock? Consider:

In the reply to this Peter suggests:

"keep ->pi_lock locked over the call to
stop_one_cpu() from set_cpus_allowed_ptr()"

Then Oleg replies to Peter with a possible problem with that approach.

If I understand Oleg's original suggestion correctly, it solves the
problem.  I inject the "pi_lock in __migrate_task()" into Oleg's
original problem description below, and show how I think it fixes the
problem:

>>
>> 1. A task T runs on CPU_0, it does set_current_state(TASK_INTERRUPTIBLE)
>>
>> 2. some CPU does set_cpus_allowed_ptr(T, new_mask), new_mask doesn't
>>    include CPU_0.
>>
>>    T is running, cpumask_any_and() picks CPU_1, set_cpus_allowed_ptr()
>>    drops pi_lock and rq->lock before stop_one_cpu().
>>
>> 3. T calls schedule() and becomes deactivated.
>>
>> 4. CPU_2 does try_to_wake_up(T, TASK_INTERRUPTIBLE), takes pi_lock
>>    and sees on_rq == F.
>>
>> 5. set_cpus_allowed_ptr() resumes and calls stop_one_cpu(cpu => 1).
>>
>> 6. cpu_stopper_thread() runs on CPU_1 and calls __migrate_task().

      It attempts to lock p->pi_lock (and thus blocks on the lock held
      by try_to_wake_up()).

      So it does not get to this double rq lock yet:
>>    It locks CPU_0 and CPU_1 rq's and checks task_cpu() == src_cpu.


>>
>> 7. CPU_2 calls select_task_rq(), it returns (to simplify) 2.
>>
>>    Now try_to_wake_up() does set_task_cpu(T, 2), and calls
>>    ttwu_queue()->ttwu_do_activate()->activate_task().

 7.1  Now __migrate_task() gets the p->pi_lock that it was blocked on,
      continues on to get the double rq lock (the last part of step 6
      that got postponed above), discovers that
      (task_cpu(p) != src_cpu), and thus skips over the problematic
      step 8:

>>
>> 8. __migrate_task() on CPU_1 sees p->on_rq and starts the
>>    deactivate/activate dance racing with ttwu_do_activate()
>>    on CPU_2.
> 
> Drat, yes I think you're right, now you've got me worried about the
> other migration paths too.. however did you come up with that
> scenario? :-)
> 
> A simple fix would be to keep ->pi_lock locked over the call to
> stop_one_cpu() from set_cpus_allowed_ptr().
> 
> I think the sched_fair.c load-balance code paths are ok because we only
> find a task to migrate after we've obtained both runqueue locks, so even
> if we migrate current, it cannot schedule (step 3).
> 
> I'm not at all sure about the sched_rt load-balance paths, will need to
> twist my head around that..

I haven't yet tried to twist my head around either the sched_fair or the
sched_rt load balance paths.  But wouldn't it just be safer (especially
given that the load balance code will be modified by somebody at some
point in the future, and that this locking complexity does require head
twisting) to just add the pi_lock in the load-balance paths also?
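
For reference, a sketch of the stopper-side change walked through
above, with ->pi_lock taken around __migrate_task() in the stopper
callback; the function and struct names are approximations of the
code of that era, so treat this as illustrative only:

	static int migration_cpu_stop(void *data)
	{
		struct migration_arg *arg = data;
		struct task_struct *p = arg->task;

		local_irq_disable();
		/* added: order against a concurrent try_to_wake_up() */
		raw_spin_lock(&p->pi_lock);
		__migrate_task(p, raw_smp_processor_id(), arg->dest_cpu);
		raw_spin_unlock(&p->pi_lock);
		local_irq_enable();
		return 0;
	}

With ->pi_lock taken first, the stopper either runs before the wakeup
grabs ->pi_lock, or runs after the wakeup has already done
set_task_cpu(), in which case the task_cpu(p) != src_cpu check makes
it bail out, exactly as in step 7.1 above.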

-Frank


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 05/18] sched: Provide p->on_rq
  2011-01-04 14:59 ` [RFC][PATCH 05/18] sched: Provide p->on_rq Peter Zijlstra
  2011-01-05  8:13   ` Yong Zhang
@ 2011-01-29  0:10   ` Frank Rowand
  1 sibling, 0 replies; 44+ messages in thread
From: Frank Rowand @ 2011-01-29  0:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On 01/04/11 06:59, Peter Zijlstra wrote:
> Provide a generic p->on_rq because the p->se.on_rq semantics are
> unfavourable for lockless wakeups but needed for sched_fair.
>
> In particular, p->on_rq is only cleared when we actually dequeue the
> task in schedule() and not on any random dequeue as done by things
> like __migrate_task() and __sched_setscheduler().
>
> This also allows us to remove p->se usage from !sched_fair code.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/sched.h   |    1 +
>  kernel/sched.c          |   36 ++++++++++++++++++------------------
>  kernel/sched_debug.c    |    2 +-
>  kernel/sched_rt.c       |   10 +++++-----
>  kernel/sched_stoptask.c |    2 +-
>  5 files changed, 26 insertions(+), 25 deletions(-)

> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c


> @@ -2571,18 +2570,20 @@ int wake_up_state(struct task_struct *p,
>   */
>  static void __sched_fork(struct task_struct *p)
>  {
> +     p->on_rq                                = 0;

Nitpick: column alignment of "= 0" ^^^^.


> +
> +     p->se.on_rq                     = 0;
>       p->se.exec_start                = 0;
>       p->se.sum_exec_runtime          = 0;
>       p->se.prev_sum_exec_runtime     = 0;
>       p->se.nr_migrations             = 0;
> +     INIT_LIST_HEAD(&p->se.group_node);
>
>  #ifdef CONFIG_SCHEDSTATS
>       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
>  #endif
>
>       INIT_LIST_HEAD(&p->rt.run_list);
> -     p->se.on_rq = 0;
> -     INIT_LIST_HEAD(&p->se.group_node);
>
>  #ifdef CONFIG_PREEMPT_NOTIFIERS
>       INIT_HLIST_HEAD(&p->preempt_notifiers);
>


-Frank


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
  2011-01-04 14:59 ` [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock() Peter Zijlstra
  2011-01-05 18:46   ` Oleg Nesterov
@ 2011-01-29  0:21   ` Frank Rowand
  2011-02-03 17:16     ` Peter Zijlstra
  1 sibling, 1 reply; 44+ messages in thread
From: Frank Rowand @ 2011-01-29  0:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On 01/04/11 06:59, Peter Zijlstra wrote:
> In order to be able to call set_task_cpu() while either holding
> p->pi_lock or task_rq(p)->lock we need to hold both locks in order to
> stabilize task_rq().
> 
> This makes task_rq_lock() acquire both locks, and have
> __task_rq_lock() validate that p->pi_lock is held. This increases the
> locking overhead for most scheduler syscalls but allows reduction of
> rq->lock contention for some scheduler hot paths (ttwu).
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  kernel/sched.c |   81 ++++++++++++++++++++++++++-------------------------------
>  1 file changed, 37 insertions(+), 44 deletions(-)
> 
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> 

> @@ -980,10 +972,13 @@ static void __task_rq_unlock(struct rq *
>  	raw_spin_unlock(&rq->lock);
>  }
>  
> -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
> +static inline void
> +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
>  	__releases(rq->lock)
> +	__releases(p->pi_lock)
>  {
> -	raw_spin_unlock_irqrestore(&rq->lock, *flags);
> +	raw_spin_unlock(&rq->lock);
> +	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
>  }
>  
>  /*

Most of the callers of task_rq_unlock() were also fixed up to reflect
the newly added parameter "*p", but a couple were missed.  By the end
of the patch series that is ok because the couple that were missed
get removed in patches 12 and 13.  But if you want the patch series
to be bisectable (which I think it is otherwise), you might want to
fix those last couple of callers of task_rq_unlock() in this patch.
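
For reference, the pairing every remaining caller ends up with after
this patch; a sketch with a made-up caller:

	static void frob_task(struct task_struct *p)	/* hypothetical */
	{
		unsigned long flags;
		struct rq *rq;

		rq = task_rq_lock(p, &flags);
		/* p's runqueue is stable here: both p->pi_lock and rq->lock held */
		task_rq_unlock(rq, p, &flags);	/* now takes the task as well */
	}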


> @@ -2646,9 +2647,9 @@ void sched_fork(struct task_struct *p, i
>          *
>          * Silence PROVE_RCU.
>          */
> -       rcu_read_lock();
> +       raw_spin_lock_irqsave(&p->pi_lock, flags);
>         set_task_cpu(p, cpu);
> -       rcu_read_unlock();
> +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);

Does "* Silence PROVE_RCU." no longer apply after remove rcu_read_lock() and
rcu_read_unlock()?

-Frank


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu()
  2011-01-04 14:59 ` [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
  2011-01-06 16:29   ` Peter Zijlstra
@ 2011-01-29  1:05   ` Frank Rowand
  2011-02-03 17:16     ` Peter Zijlstra
  1 sibling, 1 reply; 44+ messages in thread
From: Frank Rowand @ 2011-01-29  1:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On 01/04/11 06:59, Peter Zijlstra wrote:
> Currently ttwu() does two rq->lock acquisitions, once on the task's
> old rq, holding it over the p->state fiddling and load-balance pass.
> Then it drops the old rq->lock to acquire the new rq->lock.
> 
> By having serialized ttwu(), p->sched_class, p->cpus_allowed with
> p->pi_lock, we can now drop the whole first rq->lock acquisition.
> 
> The p->pi_lock serializing concurrent ttwu() calls protects p->state,
> which we will set to TASK_WAKING to bridge possible p->pi_lock to
> rq->lock gaps and serialize set_task_cpu() calls against
> task_rq_lock().
> 
> The p->pi_lock serialization of p->sched_class allows us to call
> scheduling class methods without holding the rq->lock, and the
> serialization of p->cpus_allowed allows us to do the load-balancing
> bits without races.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  kernel/sched.c      |   47 +++++++++++++++++++----------------------------
>  kernel/sched_fair.c |    3 +--
>  2 files changed, 20 insertions(+), 30 deletions(-)
> 
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -2436,69 +2436,60 @@ ttwu_post_activation(struct task_struct 
>   * Returns %true if @p was woken up, %false if it was already running
>   * or @state didn't match @p's state.
>   */
> -static int try_to_wake_up(struct task_struct *p, unsigned int state,
> -			  int wake_flags)
> +static int
> +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  {
> -	int cpu, orig_cpu, this_cpu, success = 0;
> +	int cpu, this_cpu, success = 0;
>  	unsigned long flags;
> -	unsigned long en_flags = ENQUEUE_WAKEUP;
>  	struct rq *rq;
>  
>  	this_cpu = get_cpu();
>  
>  	smp_wmb();
>  	raw_spin_lock_irqsave(&p->pi_lock, flags);
> -	rq = __task_rq_lock(p);
>  	if (!(p->state & state))
>  		goto out;
>  
>  	cpu = task_cpu(p);
>  
> -	if (p->on_rq)
> -		goto out_running;
> +	if (p->on_rq) {
> +		rq = __task_rq_lock(p);
> +		if (p->on_rq)
> +			goto out_running;
> +		__task_rq_unlock(rq);
> +	}
>  
> -	orig_cpu = cpu;
>  #ifdef CONFIG_SMP
> -	if (unlikely(task_running(rq, p)))
> -		goto out_activate;

I think this "while (p->on_cpu)" loop can lead to a deadlock.  I'll explain at the
bottom of this email.

> +	while (p->on_cpu)
> +		cpu_relax();


>  
>  	p->sched_contributes_to_load = !!task_contributes_to_load(p);
>  	p->state = TASK_WAKING;
>  
> -	if (p->sched_class->task_waking) {
> +	if (p->sched_class->task_waking)
>  		p->sched_class->task_waking(p);
> -		en_flags |= ENQUEUE_WAKING;
> -	}
>  
>  	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
> -	if (cpu != orig_cpu)
> -		set_task_cpu(p, cpu);
> -	__task_rq_unlock(rq);
> +#endif /* CONFIG_SMP */
>  
>  	rq = cpu_rq(cpu);
>  	raw_spin_lock(&rq->lock);
>  
> -	/*
> -	 * We migrated the task without holding either rq->lock, however
> -	 * since the task is not on the task list itself, nobody else
> -	 * will try and migrate the task, hence the rq should match the
> -	 * cpu we just moved it to.
> -	 */
> -	WARN_ON(task_cpu(p) != cpu);
> -	WARN_ON(p->state != TASK_WAKING);
> +#ifdef CONFIG_SMP
> +	if (cpu != task_cpu(p))
> +		set_task_cpu(p, cpu);
>  
>  	if (p->sched_contributes_to_load)
>  		rq->nr_uninterruptible--;
> +#endif
>  
> -out_activate:
> -#endif /* CONFIG_SMP */
> -	activate_task(rq, p, en_flags);
> +	activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
>  out_running:
>  	ttwu_post_activation(p, rq, wake_flags);
>  	ttwu_stat(rq, p, cpu, wake_flags);
>  	success = 1;
> -out:
>  	__task_rq_unlock(rq);
> +out:
>  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
>  	put_cpu();

The deadlock can occur if __ARCH_WANT_UNLOCKED_CTXSW and
__ARCH_WANT_INTERRUPTS_ON_CTXSW are defined.

A task sets p->state = TASK_UNINTERRUPTIBLE, then calls schedule().

schedule()
   prev->on_rq = 0
   context_switch()
      prepare_task_switch()
         prepare_lock_switch()
            raw_spin_unlock_irq(&rq->lock)

At this point, a pending interrupt (on this same cpu) is handled.
The interrupt handling results in a call to try_to_wake_up() on the
current process.  The try_to_wake_up() gets into:

   while (p->on_cpu)
      cpu_relax();

and spins forever.  This is because "prev->on_cpu = 0" happens slightly
after this point, at:

   finish_task_switch()
      finish_lock_switch()
         prev->on_cpu = 0


One possible fix would be to get rid of __ARCH_WANT_INTERRUPTS_ON_CTXSW.
I don't suspect the reaction to that suggestion will be very positive...

Another fix might be:

   while (p->on_cpu) {
      if (p == current)
         goto out_activate;
      cpu_relax();
   }

   Then add back in the out_activate label.

I don't know if the second fix is good -- I haven't thought out how
it impacts the later patches in the series.

-Frank


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 16/18] sched: Rename ttwu_post_activation
  2011-01-04 14:59 ` [RFC][PATCH 16/18] sched: Rename ttwu_post_activation Peter Zijlstra
@ 2011-01-29  1:08   ` Frank Rowand
  0 siblings, 0 replies; 44+ messages in thread
From: Frank Rowand @ 2011-01-29  1:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On 01/04/11 06:59, Peter Zijlstra wrote:
> The ttwu_post_actication() does the core wakeup, it sets TASK_RUNNING
> and performs wakeup-preemption, so give is a more descriptive name.

s/ttwu_post_actication/ttwu_post_activation/

-Frank


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4
  2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
                   ` (18 preceding siblings ...)
  2011-01-04 15:16 ` [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Ingo Molnar
@ 2011-01-29  1:20 ` Frank Rowand
  19 siblings, 0 replies; 44+ messages in thread
From: Frank Rowand @ 2011-01-29  1:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On 01/04/11 06:59, Peter Zijlstra wrote:
> Latest version of the scary patches...

For patches 1 - 17:

Reviewed-by: Frank Rowand <frank.rowand@am.sony.com>

I have a few more cosmetic comments, but I'll hold off on those until
the next version of your patches, so I can put them in the proper
context of your active work instead of ancient history.

I'm also still poking some more at patch 17, being carefully paranoid.

Patch 18 I did not review yet.

-Frank


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
  2011-01-29  0:21   ` Frank Rowand
@ 2011-02-03 17:16     ` Peter Zijlstra
  2011-02-03 17:49       ` Frank Rowand
  0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2011-02-03 17:16 UTC (permalink / raw)
  To: frank.rowand
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On Fri, 2011-01-28 at 16:21 -0800, Frank Rowand wrote:
> On 01/04/11 06:59, Peter Zijlstra wrote:
> > In order to be able to call set_task_cpu() while either holding
> > p->pi_lock or task_rq(p)->lock we need to hold both locks in order to
> > stabilize task_rq().
> > 
> > This makes task_rq_lock() acquire both locks, and have
> > __task_rq_lock() validate that p->pi_lock is held. This increases the
> > locking overhead for most scheduler syscalls but allows reduction of
> > rq->lock contention for some scheduler hot paths (ttwu).
> > 
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > ---
> >  kernel/sched.c |   81 ++++++++++++++++++++++++++-------------------------------
> >  1 file changed, 37 insertions(+), 44 deletions(-)
> > 
> > Index: linux-2.6/kernel/sched.c
> > ===================================================================
> > 
> 
> > @@ -980,10 +972,13 @@ static void __task_rq_unlock(struct rq *
> >  	raw_spin_unlock(&rq->lock);
> >  }
> >  
> > -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
> > +static inline void
> > +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
> >  	__releases(rq->lock)
> > +	__releases(p->pi_lock)
> >  {
> > -	raw_spin_unlock_irqrestore(&rq->lock, *flags);
> > +	raw_spin_unlock(&rq->lock);
> > +	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
> >  }
> >  
> >  /*
> 
> Most of the callers of task_rq_unlock() were also fixed up to reflect
> the newly added parameter "*p", but a couple were missed.  By the end
> of the patch series that is ok because the couple that were missed
> get removed in patches 12 and 13.  But if you want the patch series
> to be bisectable (which I think it is otherwise), you might want to
> fix those last couple of callers of task_rq_unlock() in this patch.

Fixed those up indeed, thanks!

> 
> > @@ -2646,9 +2647,9 @@ void sched_fork(struct task_struct *p, i
> >          *
> >          * Silence PROVE_RCU.
> >          */
> > -       rcu_read_lock();
> > +       raw_spin_lock_irqsave(&p->pi_lock, flags);
> >         set_task_cpu(p, cpu);
> > -       rcu_read_unlock();
> > +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> 
> Does "* Silence PROVE_RCU." no longer apply after remove rcu_read_lock() and
> rcu_read_unlock()?

I think the locking is still strictly superfluous; I can't seem to
recollect why I changed it from RCU to pi_lock, but since the task is
fresh and unhashed it really cannot be subject to concurrency.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu()
  2011-01-29  1:05   ` Frank Rowand
@ 2011-02-03 17:16     ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-02-03 17:16 UTC (permalink / raw)
  To: frank.rowand
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On Fri, 2011-01-28 at 17:05 -0800, Frank Rowand wrote:
> 
> The deadlock can occur if __ARCH_WANT_UNLOCKED_CTXSW and
> __ARCH_WANT_INTERRUPTS_ON_CTXSW are defined.
> 
> A task sets p->state = TASK_UNINTERRUPTIBLE, then calls schedule().
> 
> schedule()
>    prev->on_rq = 0
>    context_switch()
>       prepare_task_switch()
>          prepare_lock_switch()
>             raw_spin_unlock_irq(&rq->lock)
> 
> At this point, a pending interrupt (on this same cpu) is handled.
> The interrupt handling results in a call to try_to_wake_up() on the
> current process.  The try_to_wake_up() gets into:
> 
>    while (p->on_cpu)
>       cpu_relax();
> 
> and spins forever.  This is because "prev->on_cpu = 0" happens slightly
> after this point, at:
> 
>    finish_task_switch()
>       finish_lock_switch()
>          prev->on_cpu = 0

Right, very good spot!

> 
> One possible fix would be to get rid of __ARCH_WANT_INTERRUPTS_ON_CTXSW.
> I don't suspect the reaction to that suggestion will be very positive...

:-), afaik some architectures require this, i.e. removing it would
require dropping whole architectures.

> Another fix might be:
> 
>    while (p->on_cpu) {
>       if (p == current)
>          goto out_activate;
>       cpu_relax();
>       }
> 
>    Then add back in the out_activate label.
> 
> I don't know if the second fix is good -- I haven't thought out how
> it impacts the later patches in the series.

Right, I've done something similar to this: simply short-circuit the cpu
selection to force it to activate the task on the local cpu.
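
Roughly along these lines; a hypothetical sketch of that short-circuit,
not the actual patch (the out_activate label is assumed to be added
back):

	/* in try_to_wake_up(), instead of unconditionally spinning: */
	if (p == current) {
		/*
		 * An interrupt is waking the task we are in the middle of
		 * switching away from; skip the ->on_cpu spin and the
		 * remote cpu selection and activate it on this cpu.
		 */
		cpu = smp_processor_id();
		goto out_activate;
	}

	while (p->on_cpu)
		cpu_relax();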

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu
  2011-01-29  0:04           ` Frank Rowand
@ 2011-02-03 17:16             ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2011-02-03 17:16 UTC (permalink / raw)
  To: frank.rowand
  Cc: Oleg Nesterov, Chris Mason, Rowand, Frank, Ingo Molnar,
	Thomas Gleixner, Mike Galbraith, Paul Turner, Jens Axboe,
	Yong Zhang, linux-kernel

On Fri, 2011-01-28 at 16:04 -0800, Frank Rowand wrote:
> 
> I haven't yet tried to twist my head around either the sched_fair or the
> sched_rt load balance paths.  But wouldn't it just be safer (especially
> given that the load balance code will be modified by somebody at some
> point in the future, and that this locking complexity does require head
> twisting) to just add the pi_lock in the load-balance paths also? 

I don't think that's needed, and I'm generally hesitant to add atomics
where they are not strictly required.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
  2011-02-03 17:16     ` Peter Zijlstra
@ 2011-02-03 17:49       ` Frank Rowand
  0 siblings, 0 replies; 44+ messages in thread
From: Frank Rowand @ 2011-02-03 17:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Mason, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Oleg Nesterov, Paul Turner, Jens Axboe, Yong Zhang, linux-kernel

On 02/03/11 09:16, Peter Zijlstra wrote:
> On Fri, 2011-01-28 at 16:21 -0800, Frank Rowand wrote:
>> On 01/04/11 06:59, Peter Zijlstra wrote:

< snip >

>>> @@ -2646,9 +2647,9 @@ void sched_fork(struct task_struct *p, i
>>>          *
>>>          * Silence PROVE_RCU.
>>>          */
>>> -       rcu_read_lock();
>>> +       raw_spin_lock_irqsave(&p->pi_lock, flags);
>>>         set_task_cpu(p, cpu);
>>> -       rcu_read_unlock();
>>> +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
>>
>> Does "* Silence PROVE_RCU." no longer apply after remove rcu_read_lock() and
>> rcu_read_unlock()?
> 
> I think the locking is still strictly superfluous; I can't seem to
> recollect why I changed it from RCU to pi_lock, but since the task is
> fresh and unhashed it really cannot be subject to concurrency.

Sorry, my comment was not very clear.  I meant to ask: should the
comment "* Silence PROVE_RCU." also be removed?

-Frank

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2011-02-03 17:49 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 01/18] sched: Always provide p->on_cpu Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 02/18] mutex: Use p->on_cpu for the adaptive spin Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 03/18] sched: Change the ttwu success details Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 04/18] sched: Clean up ttwu stats Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 05/18] sched: Provide p->on_rq Peter Zijlstra
2011-01-05  8:13   ` Yong Zhang
2011-01-05  9:53     ` Peter Zijlstra
2011-01-29  0:10   ` Frank Rowand
2011-01-04 14:59 ` [RFC][PATCH 06/18] sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq() Peter Zijlstra
2011-01-06 13:57   ` Peter Zijlstra
2011-01-06 14:23     ` Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 08/18] sched: Remove rq argument to sched_class::task_waking() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 09/18] sched: Delay task_contributes_to_load() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 10/18] sched: Also serialize ttwu_local() with p->pi_lock Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock() Peter Zijlstra
2011-01-05 18:46   ` Oleg Nesterov
2011-01-05 19:33     ` Peter Zijlstra
2011-01-29  0:21   ` Frank Rowand
2011-02-03 17:16     ` Peter Zijlstra
2011-02-03 17:49       ` Frank Rowand
2011-01-04 14:59 ` [RFC][PATCH 12/18] sched: Drop rq->lock from first part of wake_up_new_task() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 13/18] sched: Drop rq->lock from sched_exec() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
2011-01-06 16:29   ` Peter Zijlstra
2011-01-29  1:05   ` Frank Rowand
2011-02-03 17:16     ` Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 15/18] sched: Remove rq argument from ttwu_stat() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 16/18] sched: Rename ttwu_post_activation Peter Zijlstra
2011-01-29  1:08   ` Frank Rowand
2011-01-04 14:59 ` [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu Peter Zijlstra
2011-01-05 21:07   ` Oleg Nesterov
2011-01-06 15:09     ` Peter Zijlstra
2011-01-07 15:22       ` Oleg Nesterov
2011-01-18 16:38         ` Peter Zijlstra
2011-01-19 19:37           ` Oleg Nesterov
2011-01-29  0:04           ` Frank Rowand
2011-02-03 17:16             ` Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing Peter Zijlstra
2011-01-05 20:47   ` Oleg Nesterov
2011-01-06 10:56     ` Peter Zijlstra
2011-01-04 15:16 ` [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Ingo Molnar
2011-01-29  1:20 ` Frank Rowand

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).