* [RFC][PATCH 0/4] sched,numa: pinned task accounting
@ 2015-05-15 15:43 Peter Zijlstra
  2015-05-15 15:43 ` [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity() Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-15 15:43 UTC (permalink / raw)
  To: mingo, riel; +Cc: dedekind1, linux-kernel, mgorman, peterz, rostedt, juri.lelli

Hi,

Here is a first batch of patches that provide pinned task accounting; they
are boot tested only so far.

I don't think this is enough to address Artem's regression, but it's a
foundation to add some more smarts. In particular we should make it harder
still to migrate well-placed tasks away because of these pinned tasks.

Rostedt, Juri, please double check what I did to your respective
set_cpus_allowed() methods.

---
 include/linux/kthread.h  |  1 +
 include/linux/sched.h    |  7 -----
 kernel/kthread.c         | 20 ++++++++++++---
 kernel/sched/core.c      | 66 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/deadline.c  | 35 ++-----------------------
 kernel/sched/fair.c      | 43 +++++++++++++++++++++++++------
 kernel/sched/idle_task.c |  1 +
 kernel/sched/rt.c        | 41 +-----------------------------
 kernel/sched/sched.h     |  3 +++
 kernel/sched/stop_task.c |  1 +
 kernel/workqueue.c       | 11 ++------
 11 files changed, 121 insertions(+), 108 deletions(-)



* [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-05-15 15:43 [RFC][PATCH 0/4] sched,numa: pinned task accounting Peter Zijlstra
@ 2015-05-15 15:43 ` Peter Zijlstra
  2015-05-15 15:56   ` Tejun Heo
  2015-08-12 12:38   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  2015-05-15 15:43 ` [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-15 15:43 UTC (permalink / raw)
  To: mingo, riel
  Cc: dedekind1, linux-kernel, mgorman, peterz, rostedt, juri.lelli,
	Tejun Heo, Oleg Nesterov

[-- Attachment #1: peterz-sched-kthread_bind-vs-sched_setaffinity.patch --]
[-- Type: text/plain, Size: 6503 bytes --]

Because sched_setaffinity() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it.

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the issue by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of
task_struct::{nr_,}cpus_allowed.
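
For illustration, a minimal user-space analogue of this unlocked
check-then-act pattern (plain pthreads, not kernel code; all names below
are made up). The bug is checking the flag outside the lock that
serializes the mask update; the fix corresponds to re-checking the flag
with that lock held, as the patch below does in __set_cpus_allowed_ptr().

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int no_setaffinity;              /* stand-in for PF_NO_SETAFFINITY */
static unsigned int cpus_allowed = ~0u; /* stand-in for p->cpus_allowed   */

static void *kthread_bind_side(void *arg)
{
	pthread_mutex_lock(&lock);      /* the "scheduler lock" */
	cpus_allowed = 0x1;             /* do_set_cpus_allowed()         */
	no_setaffinity = 1;             /* p->flags |= PF_NO_SETAFFINITY */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void *setaffinity_side(void *arg)
{
	if (no_setaffinity)     /* BUG: checked outside the lock, may be stale */
		return NULL;
	pthread_mutex_lock(&lock);
	cpus_allowed = ~0u;     /* effectively undoes the bind above */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, kthread_bind_side, NULL);
	pthread_create(&b, NULL, setaffinity_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("cpus_allowed = %#x (0x1 expected, 0xffffffff if the race hit)\n",
	       cpus_allowed);
	return 0;
}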

Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/kthread.h |    1 +
 include/linux/sched.h   |    7 -------
 kernel/kthread.c        |   21 ++++++++++++++++++---
 kernel/sched/core.c     |   30 ++++++++++++++++++++++++++----
 kernel/workqueue.c      |   11 ++---------
 5 files changed, 47 insertions(+), 23 deletions(-)

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cp
 })
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2215,13 +2215,6 @@ static inline void calc_load_enter_idle(
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-	return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_no
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
+	unsigned long flags;
+
 	if (!wait_task_inactive(p, state)) {
 		WARN_ON(1);
 		return;
 	}
+
 	/* It's safe because the task is inactive. */
-	do_set_cpus_allowed(p, cpumask_of(cpu));
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	do_set_cpus_allowed(p, mask);
 	p->flags |= PF_NO_SETAFFINITY;
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+	__kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }
 
 /**
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4044,6 +4044,9 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pi
 	return retval;
 }
 
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check);
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -4110,7 +4113,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4638,7 +4641,8 @@ void init_idle(struct task_struct *idle,
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4664,7 +4668,8 @@ void init_idle(struct task_struct *idle,
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
@@ -4788,6 +4793,8 @@ static struct rq *move_queued_task(struc
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
@@ -4818,7 +4825,8 @@ void do_set_cpus_allowed(struct task_str
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -4827,6 +4835,15 @@ int set_cpus_allowed_ptr(struct task_str
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -4856,6 +4873,11 @@ int set_cpus_allowed_ptr(struct task_str
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 /*
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1619,11 +1619,7 @@ static void worker_attach_to_pool(struct
 {
 	mutex_lock(&pool->attach_mutex);
 
-	/*
-	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-	 * online CPUs.  It'll be re-applied when any of the CPUs come up.
-	 */
-	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+	kthread_bind_mask(worker->task, pool->attrs->cpumask);
 
 	/*
 	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -1708,9 +1704,6 @@ static struct worker *create_worker(stru
 
 	set_user_nice(worker->task, pool->attrs->nice);
 
-	/* prevent userland from meddling with cpumask of workqueue workers */
-	worker->task->flags |= PF_NO_SETAFFINITY;
-
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
 
@@ -3832,7 +3825,7 @@ struct workqueue_struct *__alloc_workque
 		}
 
 		wq->rescuer = rescuer;
-		rescuer->task->flags |= PF_NO_SETAFFINITY;
+		kthread_bind_mask(rescuer->task, cpu_possible_mask);
 		wake_up_process(rescuer->task);
 	}
 




* [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional
  2015-05-15 15:43 [RFC][PATCH 0/4] sched,numa: pinned task accounting Peter Zijlstra
  2015-05-15 15:43 ` [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity() Peter Zijlstra
@ 2015-05-15 15:43 ` Peter Zijlstra
  2015-08-12 12:38   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  2015-08-20 16:45   ` [RFC][PATCH 2/4] " Sasha Levin
  2015-05-15 15:43 ` [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-15 15:43 UTC (permalink / raw)
  To: mingo, riel; +Cc: dedekind1, linux-kernel, mgorman, peterz, rostedt, juri.lelli

[-- Attachment #1: peterz-sched-affinity.patch --]
[-- Type: text/plain, Size: 4346 bytes --]

Give every class a set_cpus_allowed() method; this enables a small
optimization in the rt and dl implementations by avoiding a double
cpumask_weight() call.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c      |   17 +++++++++++------
 kernel/sched/deadline.c  |   20 ++++++++++++--------
 kernel/sched/fair.c      |    1 +
 kernel/sched/idle_task.c |    1 +
 kernel/sched/rt.c        |   12 ++++++++----
 kernel/sched/sched.h     |    2 ++
 kernel/sched/stop_task.c |    1 +
 7 files changed, 36 insertions(+), 18 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4791,17 +4791,22 @@ static struct rq *move_queued_task(struc
 	return rq;
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	lockdep_assert_held(&p->pi_lock);
-
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	lockdep_assert_held(&p->pi_lock);
+	p->sched_class->set_cpus_allowed(p, new_mask);
+}
+
 /*
  * This is how migration works:
  *
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1597,13 +1597,6 @@ static void set_cpus_allowed_dl(struct t
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	/*
-	 * Update only if the task is actually running (i.e.,
-	 * it is on the rq AND it is not throttled).
-	 */
-	if (!on_dl_rq(&p->dl))
-		return;
-
 	weight = cpumask_weight(new_mask);
 
 	/*
@@ -1611,7 +1604,14 @@ static void set_cpus_allowed_dl(struct t
 	 * can migrate or not.
 	 */
 	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		return;
+		goto done;
+
+	/*
+	 * Update only if the task is actually running (i.e.,
+	 * it is on the rq AND it is not throttled).
+	 */
+	if (!on_dl_rq(&p->dl))
+		goto done;
 
 	/*
 	 * The process used to be able to migrate OR it can now migrate
@@ -1628,6 +1628,10 @@ static void set_cpus_allowed_dl(struct t
 	}
 
 	update_dl_migration(&rq->dl);
+
+done:
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8406,6 +8406,7 @@ const struct sched_class fair_sched_clas
 	.rq_offline		= rq_offline_fair,
 
 	.task_waking		= task_waking_fair,
+	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
 
 	.set_curr_task          = set_curr_task_fair,
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_clas
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_idle,
+	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
 
 	.set_curr_task          = set_curr_task_idle,
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2065,9 +2065,6 @@ static void set_cpus_allowed_rt(struct t
 
 	BUG_ON(!rt_task(p));
 
-	if (!task_on_rq_queued(p))
-		return;
-
 	weight = cpumask_weight(new_mask);
 
 	/*
@@ -2075,7 +2072,10 @@ static void set_cpus_allowed_rt(struct t
 	 * can migrate or not.
 	 */
 	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		return;
+		goto done;
+
+	if (!task_on_rq_queued(p))
+		goto done;
 
 	rq = task_rq(p);
 
@@ -2094,6 +2094,10 @@ static void set_cpus_allowed_rt(struct t
 	}
 
 	update_rt_migration(&rq->rt);
+
+done:
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1252,6 +1252,8 @@ extern void trigger_load_balance(struct
 extern void idle_enter_fair(struct rq *this_rq);
 extern void idle_exit_fair(struct rq *this_rq);
 
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
 #else
 
 static inline void idle_enter_fair(struct rq *rq) { }
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_clas
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_stop,
+	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
 
 	.set_curr_task          = set_curr_task_stop,




* [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context
  2015-05-15 15:43 [RFC][PATCH 0/4] sched,numa: pinned task accounting Peter Zijlstra
  2015-05-15 15:43 ` [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity() Peter Zijlstra
  2015-05-15 15:43 ` [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional Peter Zijlstra
@ 2015-05-15 15:43 ` Peter Zijlstra
       [not found]   ` <OF66BF3765.2EBFD3B1-ON48257E49.0028DC79-48257E49.0029F058@zte.com.cn>
  2015-08-12 12:38   ` [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) " tip-bot for Peter Zijlstra
  2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
  2015-05-18  9:08 ` [RFC][PATCH 0/4] sched,numa: pinned task accounting Artem Bityutskiy
  4 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-15 15:43 UTC (permalink / raw)
  To: mingo, riel; +Cc: dedekind1, linux-kernel, mgorman, peterz, rostedt, juri.lelli

[-- Attachment #1: peterz-sched-set_cpus_allowed-context.patch --]
[-- Type: text/plain, Size: 4135 bytes --]

Change the calling context of sched_class::set_cpus_allowed() such
that we can assume the task is inactive.

This allows us to easily make changes that affect accounting done by
enqueue/dequeue. This does in fact completely remove
set_cpus_allowed_rt and greatly reduces set_cpus_allowed_dl.


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   23 +++++++++++++++++++++++
 kernel/sched/deadline.c |   39 ++-------------------------------------
 kernel/sched/rt.c       |   45 +--------------------------------------------
 3 files changed, 26 insertions(+), 81 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4803,8 +4803,31 @@ void set_cpus_allowed_common(struct task
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
 	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+		dequeue_task(rq, p, 0);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
 	p->sched_class->set_cpus_allowed(p, new_mask);
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (queued)
+		enqueue_task(rq, p, 0);
 }
 
 /*
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1569,9 +1569,8 @@ static void task_woken_dl(struct rq *rq,
 static void set_cpus_allowed_dl(struct task_struct *p,
 				const struct cpumask *new_mask)
 {
-	struct rq *rq;
 	struct root_domain *src_rd;
-	int weight;
+	struct rq *rq;
 
 	BUG_ON(!dl_task(p));
 
@@ -1597,41 +1596,7 @@ static void set_cpus_allowed_dl(struct t
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	weight = cpumask_weight(new_mask);
-
-	/*
-	 * Only update if the process changes its state from whether it
-	 * can migrate or not.
-	 */
-	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		goto done;
-
-	/*
-	 * Update only if the task is actually running (i.e.,
-	 * it is on the rq AND it is not throttled).
-	 */
-	if (!on_dl_rq(&p->dl))
-		goto done;
-
-	/*
-	 * The process used to be able to migrate OR it can now migrate
-	 */
-	if (weight <= 1) {
-		if (!task_current(rq, p))
-			dequeue_pushable_dl_task(rq, p);
-		BUG_ON(!rq->dl.dl_nr_migratory);
-		rq->dl.dl_nr_migratory--;
-	} else {
-		if (!task_current(rq, p))
-			enqueue_pushable_dl_task(rq, p);
-		rq->dl.dl_nr_migratory++;
-	}
-
-	update_dl_migration(&rq->dl);
-
-done:
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = weight;
+	set_cpus_allowed_common(p, new_mask);
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2057,49 +2057,6 @@ static void task_woken_rt(struct rq *rq,
 		push_rt_tasks(rq);
 }
 
-static void set_cpus_allowed_rt(struct task_struct *p,
-				const struct cpumask *new_mask)
-{
-	struct rq *rq;
-	int weight;
-
-	BUG_ON(!rt_task(p));
-
-	weight = cpumask_weight(new_mask);
-
-	/*
-	 * Only update if the process changes its state from whether it
-	 * can migrate or not.
-	 */
-	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		goto done;
-
-	if (!task_on_rq_queued(p))
-		goto done;
-
-	rq = task_rq(p);
-
-	/*
-	 * The process used to be able to migrate OR it can now migrate
-	 */
-	if (weight <= 1) {
-		if (!task_current(rq, p))
-			dequeue_pushable_task(rq, p);
-		BUG_ON(!rq->rt.rt_nr_migratory);
-		rq->rt.rt_nr_migratory--;
-	} else {
-		if (!task_current(rq, p))
-			enqueue_pushable_task(rq, p);
-		rq->rt.rt_nr_migratory++;
-	}
-
-	update_rt_migration(&rq->rt);
-
-done:
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = weight;
-}
-
 /* Assumes rq->lock is held */
 static void rq_online_rt(struct rq *rq)
 {
@@ -2313,7 +2270,7 @@ const struct sched_class rt_sched_class
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_rt,
 
-	.set_cpus_allowed       = set_cpus_allowed_rt,
+	.set_cpus_allowed       = set_cpus_allowed_common,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
 	.post_schedule		= post_schedule_rt,




* [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-15 15:43 [RFC][PATCH 0/4] sched,numa: pinned task accounting Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-05-15 15:43 ` [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context Peter Zijlstra
@ 2015-05-15 15:43 ` Peter Zijlstra
  2015-05-15 19:05   ` Rik van Riel
                     ` (3 more replies)
  2015-05-18  9:08 ` [RFC][PATCH 0/4] sched,numa: pinned task accounting Artem Bityutskiy
  4 siblings, 4 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-15 15:43 UTC (permalink / raw)
  To: mingo, riel; +Cc: dedekind1, linux-kernel, mgorman, peterz, rostedt, juri.lelli

[-- Attachment #1: peterz-sched-use-nr_pinned-numa.patch --]
[-- Type: text/plain, Size: 4590 bytes --]

Per-CPU (kernel) threads can currently trick the load balancer into
thinking there's something to do, resulting in perfectly placed tasks
being moved away.

By virtue of the new do_set_cpus_allowed() we can easily add nr_pinned
accounting, which we'll use to change the fbq classification such that
we are less likely to pick such tasks for migration.

Note that it is still possible we'll migrate these well-placed tasks
away; further patches could reduce this further.

Suggested-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c  |   42 ++++++++++++++++++++++++++++++++++--------
 kernel/sched/sched.h |    1 +
 2 files changed, 35 insertions(+), 8 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -858,12 +858,20 @@ static unsigned int task_scan_max(struct
 
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
+	if (p->nr_cpus_allowed == 1) {
+		p->numa_preferred_nid = -1;
+		rq->nr_pinned_running++;
+	}
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
 }
 
 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
+	if (p->nr_cpus_allowed == 1) {
+		rq->nr_pinned_running--;
+		WARN_ON_ONCE(p->numa_preferred_nid != -1);
+	}
 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 }
@@ -2266,6 +2274,12 @@ void task_tick_numa(struct rq *rq, struc
 		return;
 
 	/*
+	 * We don't care about NUMA placement if we can't migrate the task.
+	 */
+	if (curr->nr_cpus_allowed == 1)
+		return;
+
+	/*
 	 * Using runtime rather than walltime has the dual advantage that
 	 * we (mostly) drive the selection from busy threads and that the
 	 * task needs to have done some actual work before we bother with
@@ -5567,7 +5581,7 @@ static bool yield_to_task_fair(struct rq
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
-enum fbq_type { regular, remote, all };
+enum fbq_type { regular, remote, movable, all };
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
@@ -6112,6 +6126,7 @@ struct sg_lb_stats {
 	enum group_type group_type;
 	int group_no_capacity;
 #ifdef CONFIG_NUMA_BALANCING
+	unsigned int nr_pinned_running;
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
 #endif
@@ -6449,6 +6464,7 @@ static inline void update_sg_lb_stats(st
 			*overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
+		sgs->nr_pinned_running += rq->nr_pinned_running;
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
@@ -6522,19 +6538,27 @@ static bool update_sd_pick_busiest(struc
 #ifdef CONFIG_NUMA_BALANCING
 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running > sgs->nr_numa_running)
+	unsigned int nr_migratable = sgs->sum_nr_running - sgs->nr_pinned_running;
+
+	if (nr_migratable > sgs->nr_numa_running)
 		return regular;
-	if (sgs->sum_nr_running > sgs->nr_preferred_running)
+	if (nr_migratable > sgs->nr_preferred_running)
 		return remote;
+	if (nr_migratable)
+		return movable;
 	return all;
 }
 
 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 {
-	if (rq->nr_running > rq->nr_numa_running)
+	unsigned int nr_migratable = rq->cfs.h_nr_running - rq->nr_pinned_running;
+
+	if (nr_migratable > rq->nr_numa_running)
 		return regular;
-	if (rq->nr_running > rq->nr_preferred_running)
+	if (nr_migratable > rq->nr_preferred_running)
 		return remote;
+	if (nr_migratable)
+		return movable;
 	return all;
 }
 #else
@@ -6938,9 +6962,11 @@ static struct rq *find_busiest_queue(str
 
 		/*
 		 * We classify groups/runqueues into three groups:
-		 *  - regular: there are !numa tasks
-		 *  - remote:  there are numa tasks that run on the 'wrong' node
-		 *  - all:     there is no distinction
+		 *  - regular: there are (migratable) !numa tasks
+		 *  - remote:  there are (migratable) numa tasks that
+		 *             run on the 'wrong' node
+		 *  - movable: there are (migratable) tasks
+		 *  - all:     there are tasks
 		 *
 		 * In order to avoid migrating ideally placed numa tasks,
 		 * ignore those when there's better options.
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -571,6 +571,7 @@ struct rq {
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
+	unsigned int nr_pinned_running;
 #endif
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];




* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-05-15 15:43 ` [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity() Peter Zijlstra
@ 2015-05-15 15:56   ` Tejun Heo
  2015-08-07 14:27     ` Peter Zijlstra
  2015-08-12 12:38   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  1 sibling, 1 reply; 32+ messages in thread
From: Tejun Heo @ 2015-05-15 15:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

On Fri, May 15, 2015 at 05:43:34PM +0200, Peter Zijlstra wrote:
> Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
> without locks, a caller might observe an old value and race with the
> set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
> it.
> 
> 	__kthread_bind()
> 	  do_set_cpus_allowed()
> 						<SYSCALL>
> 						  sched_setaffinity()
> 						    if (p->flags & PF_NO_SETAFFINITIY)
> 						    set_cpus_allowed_ptr()
> 	  p->flags |= PF_NO_SETAFFINITY
> 
> Fix the issue by putting everything under the regular scheduler locks.
> 
> This also closes a hole in the serialization of
> task_struct::{nr_,}cpus_allowed.
> 
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

For workqueue part,

 Acked-by: Tejun Heo <tj@kernel.org>

Please route any way you see fit.

Thanks.

-- 
tejun


* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
@ 2015-05-15 19:05   ` Rik van Riel
  2015-05-16  9:31   ` Peter Zijlstra
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 32+ messages in thread
From: Rik van Riel @ 2015-05-15 19:05 UTC (permalink / raw)
  To: Peter Zijlstra, mingo
  Cc: dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

On 05/15/2015 11:43 AM, Peter Zijlstra wrote:

> Per cpu (kernel) threads can currently trick the load balancer in
> thinking there's something to do and result in moving perfectly placed
> tasks away.
> 
> By virtue of the new do_set_cpus_allowed() we can easily add nr_pinned
> accounting. Which we'll use to change the fbq classification such that
> we make it less likely to pick such tasks for migration.
> 
> Note that it is still possible we'll migrate these well placed tasks
> away, further patches could reduce this still.

A few ideas here :)

- Identify whether the current task on a CPU is the one
  that would like to migrate, and treat the runqueue as
  "movable" instead of "remote" if none of the runnable
  but queued tasks want to migrate away.

- Teach can_migrate_task about the fbq types, and have
  it enforce NUMA placement when the type is "remote",
  and ignore NUMA placement when we know there are no
  good tasks to move.  This might require another
  simplification of migrate_improves/degrades_locality.

> Suggested-by: Rik van Riel <riel@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Reviewed-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed


* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
  2015-05-15 19:05   ` Rik van Riel
@ 2015-05-16  9:31   ` Peter Zijlstra
  2015-05-18 13:00   ` Srikar Dronamraju
  2015-05-18 13:10   ` Srikar Dronamraju
  3 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-16  9:31 UTC (permalink / raw)
  To: mingo, riel; +Cc: dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

On Fri, May 15, 2015 at 05:43:37PM +0200, Peter Zijlstra wrote:
>  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
>  {
> +	if (p->nr_cpus_allowed == 1) {
> +		p->numa_preferred_nid = -1;
> +		rq->nr_pinned_running++;
> +	}
>  	rq->nr_numa_running += (p->numa_preferred_nid != -1);
>  	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
>  }

>  static inline enum fbq_type fbq_classify_rq(struct rq *rq)
>  {
> +	unsigned int nr_migratable = rq->cfs.h_nr_running - rq->nr_pinned_running;
> +

FWIW, there's a problem there with CFS bandwidth muck. When we throttle
groups we update cfs.h_nr_running properly, but we do not hierarchically
account the pinned, preferred and numa counts.

So the above subtraction can end up negative.

I've not yet decided what to do about this; ideally we'd do the
hierarchical accounting of the numa stats -- but that's a little bit
more expensive than I'd like.

Ah well, that's for Monday.


* Re: [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context
       [not found]   ` <OF66BF3765.2EBFD3B1-ON48257E49.0028DC79-48257E49.0029F058@zte.com.cn>
@ 2015-05-18  8:32     ` Peter Zijlstra
  2015-05-18  9:34       ` Juri Lelli
  2015-05-18 20:04       ` Peter Zijlstra
  0 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-18  8:32 UTC (permalink / raw)
  To: pang.xunlei
  Cc: dedekind1, juri.lelli, linux-kernel, linux-kernel-owner, mgorman,
	mingo, riel, rostedt

On Mon, May 18, 2015 at 03:37:43PM +0800, pang.xunlei@zte.com.cn wrote:
> Hi Peter,
> 
> With this modification, I think the pushing action in my previous patch 
> "Check to push the task away after its affinity was changed" will not
> be able to be implemented inside sched_class::set_cpus_allowed().

Ah, right, I knew there was a patch I needed to look at.

I'll try and not forget, but there's a few regression reports I need to
look at first.


* Re: [RFC][PATCH 0/4] sched,numa: pinned task accounting
  2015-05-15 15:43 [RFC][PATCH 0/4] sched,numa: pinned task accounting Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
@ 2015-05-18  9:08 ` Artem Bityutskiy
  4 siblings, 0 replies; 32+ messages in thread
From: Artem Bityutskiy @ 2015-05-18  9:08 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: mingo, riel, linux-kernel, mgorman, rostedt, juri.lelli

On Fri, 2015-05-15 at 17:43 +0200, Peter Zijlstra wrote:
> Hi,
> 
> Here's a first few patches that provide pinned task accounting; they are boot
> tested only so far.
> 
> I don't think this is enough to address Artem's regression, but its a
> foundation to add some more smarts. In particular we should make it harder
> still to migrate well placed tasks away due to these pinned tasks.

Hi,

The most useful thing I can do is to test your patches, which I did, and
there seems to be no difference: the average server response time is still
1.4 seconds.

Artem.



* Re: [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context
  2015-05-18  8:32     ` Peter Zijlstra
@ 2015-05-18  9:34       ` Juri Lelli
  2015-05-18 20:04       ` Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: Juri Lelli @ 2015-05-18  9:34 UTC (permalink / raw)
  To: Peter Zijlstra, pang.xunlei
  Cc: dedekind1, linux-kernel, linux-kernel-owner, mgorman, mingo,
	riel, rostedt

Hi Peter,

On 05/18/2015 09:32 AM, Peter Zijlstra wrote:
> On Mon, May 18, 2015 at 03:37:43PM +0800, pang.xunlei@zte.com.cn wrote:
>> Hi Peter,
>>
>> With this modification, I think the pushing action in my previous patch 
>> "Check to push the task away after its affinity was changed" will not
>> be able to be implemented inside sched_class::set_cpus_allowed().
> 
> Ah, right, I knew there was a patch I needed to look at.
> 
> I'll try and not forget, but there's a few regression reports I need to
> look at first.
> 

Apart from this (and the fact that I still have to look at Xunlei's patches too)
the changes seem ok with DL. Didn't test them yet though. I'll do it soon.

Best,

- Juri



* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
  2015-05-15 19:05   ` Rik van Riel
  2015-05-16  9:31   ` Peter Zijlstra
@ 2015-05-18 13:00   ` Srikar Dronamraju
  2015-05-18 13:06     ` Peter Zijlstra
  2015-05-18 13:10   ` Srikar Dronamraju
  3 siblings, 1 reply; 32+ messages in thread
From: Srikar Dronamraju @ 2015-05-18 13:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

>  
>  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
>  {
> +	if (p->nr_cpus_allowed == 1) {
> +		rq->nr_pinned_running--;
> +		WARN_ON_ONCE(p->numa_preferred_nid != -1);
> +	}
>  	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
>  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
>  }


Shouldn't we reset p->numa_preferred_nid when we are setting the allowed
cpus in set_cpus_allowed_common()?

Otherwise, if a process has a preferred node set based on its numa faults
but is then pinned to a different cpu, we can see this warning.


-- 
Thanks and Regards
Srikar Dronamraju



* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-18 13:00   ` Srikar Dronamraju
@ 2015-05-18 13:06     ` Peter Zijlstra
  2015-05-18 14:13       ` Rik van Riel
  2015-05-18 14:29       ` Srikar Dronamraju
  0 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-18 13:06 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

On Mon, 2015-05-18 at 18:30 +0530, Srikar Dronamraju wrote:
> >  
> >  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
> >  {
> > +	if (p->nr_cpus_allowed == 1) {
> > +		rq->nr_pinned_running--;
> > +		WARN_ON_ONCE(p->numa_preferred_nid != -1);
> > +	}
> >  	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
> >  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
> >  }
> 
> 
> Shouldnt we reset p->numa_preferred_nid when we are setting the allowed
> cpus in set_cpus_allowed_common()? 
> 
> Otherwise if an process is set a preferred node based on its numa faults
> but then is pinned to a different cpu, then we can see this warning.:w!

We should never get preferred_nid set when nr_cpus_allowed == 1, see the
hunk that changes task_tick_numa.

So we set preferred = -1 on pinning, do not partake in numa balancing
while this is so, therefore it should still be so when we dequeue,
right?




* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
                     ` (2 preceding siblings ...)
  2015-05-18 13:00   ` Srikar Dronamraju
@ 2015-05-18 13:10   ` Srikar Dronamraju
  3 siblings, 0 replies; 32+ messages in thread
From: Srikar Dronamraju @ 2015-05-18 13:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

>  
>  		/*
>  		 * We classify groups/runqueues into three groups:

Nit: Should we update the above from "three groups" to "four groups"?

> -		 *  - regular: there are !numa tasks
> -		 *  - remote:  there are numa tasks that run on the 'wrong' node
> -		 *  - all:     there is no distinction
> +		 *  - regular: there are (migratable) !numa tasks
> +		 *  - remote:  there are (migratable) numa tasks that
> +		 *             run on the 'wrong' node
> +		 *  - movable: there are (migratable) tasks
> +		 *  - all:     there are tasks
>  		 *
>  		 * In order to avoid migrating ideally placed numa tasks,
>  		 * ignore those when there's better options.

-- 
Thanks and Regards
Srikar Dronamraju



* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-18 13:06     ` Peter Zijlstra
@ 2015-05-18 14:13       ` Rik van Riel
  2015-05-18 14:29       ` Srikar Dronamraju
  1 sibling, 0 replies; 32+ messages in thread
From: Rik van Riel @ 2015-05-18 14:13 UTC (permalink / raw)
  To: Peter Zijlstra, Srikar Dronamraju
  Cc: mingo, dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

On 05/18/2015 09:06 AM, Peter Zijlstra wrote:
> On Mon, 2015-05-18 at 18:30 +0530, Srikar Dronamraju wrote:
>>>  
>>>  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
>>>  {
>>> +	if (p->nr_cpus_allowed == 1) {
>>> +		rq->nr_pinned_running--;
>>> +		WARN_ON_ONCE(p->numa_preferred_nid != -1);
>>> +	}
>>>  	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
>>>  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
>>>  }
>>
>>
>> Shouldnt we reset p->numa_preferred_nid when we are setting the allowed
>> cpus in set_cpus_allowed_common()? 
>>
>> Otherwise if an process is set a preferred node based on its numa faults
>> but then is pinned to a different cpu, then we can see this warning.:w!
> 
> We should never get preferred_nid set when nr_cpus_allowed == 1, see the
> hunk that changes task_tick_numa.
> 
> So we set preferred = -1 on pinning, do not partake in numa balancing
> while this is so, therefore it should still be so when we dequeue,
> right?

It could be pinned after it has been running for a while,
with taskset -c <cpu> -p <pid>

-- 
All rights reversed


* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-18 13:06     ` Peter Zijlstra
  2015-05-18 14:13       ` Rik van Riel
@ 2015-05-18 14:29       ` Srikar Dronamraju
  2015-05-18 15:09         ` Peter Zijlstra
  1 sibling, 1 reply; 32+ messages in thread
From: Srikar Dronamraju @ 2015-05-18 14:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

* Peter Zijlstra <peterz@infradead.org> [2015-05-18 15:06:58]:

> On Mon, 2015-05-18 at 18:30 +0530, Srikar Dronamraju wrote:
> > >  
> > >  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
> > >  {
> > > +	if (p->nr_cpus_allowed == 1) {
> > > +		rq->nr_pinned_running--;
> > > +		WARN_ON_ONCE(p->numa_preferred_nid != -1);
> > > +	}
> > >  	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
> > >  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
> > >  }
> > 
> > 
> > Shouldnt we reset p->numa_preferred_nid when we are setting the allowed
> > cpus in set_cpus_allowed_common()? 
> > 
> > Otherwise if an process is set a preferred node based on its numa faults
> > but then is pinned to a different cpu, then we can see this warning.:w!
> 
> We should never get preferred_nid set when nr_cpus_allowed == 1, see the
> hunk that changes task_tick_numa.
> 
> So we set preferred = -1 on pinning, do not partake in numa balancing
> while this is so, therefore it should still be so when we dequeue,
> right?

Let's say a thread were to do a sched_setaffinity() on itself;
would it not call account_numa_dequeue() before account_numa_enqueue()?


Also, setting preferred = -1 in set_cpus_allowed() saves us from setting
it in account_numa_enqueue(); account_numa_enqueue() would probably be
called more often than set_cpus_allowed().

> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Thanks and Regards
Srikar Dronamraju



* Re: [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks
  2015-05-18 14:29       ` Srikar Dronamraju
@ 2015-05-18 15:09         ` Peter Zijlstra
  0 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-18 15:09 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

On Mon, May 18, 2015 at 07:59:40PM +0530, Srikar Dronamraju wrote:
> > We should never get preferred_nid set when nr_cpus_allowed == 1, see the
> > hunk that changes task_tick_numa.
> > 
> > So we set preferred = -1 on pinning, do not partake in numa balancing
> > while this is so, therefore it should still be so when we dequeue,
> > right?
> 
> lets say if a thread were to do a sched_setaffinity on itself ;
> would it not call account_numa_dequeue before account_numa_enqueue?

Yes, but it would call dequeue while nr_cpus_allowed was still the 'old'
value, and it would call enqueue when it's the 'new' value.

> Also setting preferred = -1 in set_cpus_allowed avoids us from setting
> it at account_numa_enqueue. account_numa_enqueue() would probably be
> called more times than set_cpus_allowed.

This is true; and that makes sense.
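
A rough sketch of what that could look like, assuming the
set_cpus_allowed_common() from patch 2 (no such change has actually been
posted in this thread):

void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);

#ifdef CONFIG_NUMA_BALANCING
	/*
	 * A pinned task no longer takes part in NUMA balancing, so clear
	 * its preferred node here rather than in account_numa_enqueue().
	 */
	if (p->nr_cpus_allowed == 1)
		p->numa_preferred_nid = -1;
#endif
}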


* Re: [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context
  2015-05-18  8:32     ` Peter Zijlstra
  2015-05-18  9:34       ` Juri Lelli
@ 2015-05-18 20:04       ` Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-05-18 20:04 UTC (permalink / raw)
  To: pang.xunlei
  Cc: dedekind1, juri.lelli, linux-kernel, linux-kernel-owner, mgorman,
	mingo, riel, rostedt

On Mon, May 18, 2015 at 10:32:16AM +0200, Peter Zijlstra wrote:
> On Mon, May 18, 2015 at 03:37:43PM +0800, pang.xunlei@zte.com.cn wrote:
> > Hi Peter,
> > 
> > With this modification, I think the pushing action in my previous patch 
> > "Check to push the task away after its affinity was changed" will not
> > be able to be implemented inside sched_class::set_cpus_allowed().
> 
> Ah, right, I knew there was a patch I needed to look at.

So basically you want to do:

+check_push:
+       if (weight > 1 &&
+           !task_running(rq, p) &&
+           !test_tsk_need_resched(rq->curr) &&
+           !cpumask_subset(new_mask, &p->cpus_allowed)) {
+               /* Update new affinity and try to push. */
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->nr_cpus_allowed = weight;
+               push_rt_tasks(rq);
+               return true;
+       }

in set_cpus_allowed_rt(), which would not work because of us calling
put_prev_task(), which does enqueue_pushable_task() and would allow
pick_next_pushable_task() to select the current task, which would then
BUG_ON().

Note however that you already test for !task_running(), which precludes
that entire argument, because if @p is not running, we did not call
put_prev_task() etc..

So I think the above would still work, albeit it needs a comment
explaining why.


* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-05-15 15:56   ` Tejun Heo
@ 2015-08-07 14:27     ` Peter Zijlstra
  2015-08-07 15:16       ` Tejun Heo
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-07 14:27 UTC (permalink / raw)
  To: Tejun Heo
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

On Fri, May 15, 2015 at 11:56:53AM -0400, Tejun Heo wrote:
> On Fri, May 15, 2015 at 05:43:34PM +0200, Peter Zijlstra wrote:
> > Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
> > without locks, a caller might observe an old value and race with the
> > set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
> > it.
> > 
> > 	__kthread_bind()
> > 	  do_set_cpus_allowed()
> > 						<SYSCALL>
> > 						  sched_setaffinity()
> > 						    if (p->flags & PF_NO_SETAFFINITIY)
> > 						    set_cpus_allowed_ptr()
> > 	  p->flags |= PF_NO_SETAFFINITY
> > 
> > Fix the issue by putting everything under the regular scheduler locks.
> > 
> > This also closes a hole in the serialization of
> > task_struct::{nr_,}cpus_allowed.
> > 
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Oleg Nesterov <oleg@redhat.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> For workqueue part,
> 
>  Acked-by: Tejun Heo <tj@kernel.org>

Sorry for being very late on this; I got sidetracked with other bits.

This threw up a warning on testing:

[    2.443944] WARNING: CPU: 0 PID: 10 at kernel/kthread.c:333 __kthread_bind_mask+0x34/0x6e()
[    2.446978] Modules linked in:
[    2.448359] CPU: 0 PID: 10 Comm: khelper Not tainted 4.1.0-rc6-00314-g6455666 #4
[    2.450990] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[    2.454132]  0000000000000009 ffff88000f643d68 ffffffff81a3df14 0000000000000b02
[    2.470295]  0000000000000000 ffff88000f643da8 ffffffff810f308f 000000000f643da8
[    2.503291]  ffffffff8110d116 ffff88000f55d580 ffff88000f5240c0 ffff88000f4936e0
[    2.506510] Call Trace:
[    2.520770]  [<ffffffff81a3df14>] dump_stack+0x4c/0x65
[    2.522479]  [<ffffffff810f308f>] warn_slowpath_common+0xa1/0xbb
[    2.524334]  [<ffffffff8110d116>] ? __kthread_bind_mask+0x34/0x6e
[    2.526219]  [<ffffffff810f314c>] warn_slowpath_null+0x1a/0x1c
[    2.528069]  [<ffffffff8110d116>] __kthread_bind_mask+0x34/0x6e
[    2.529925]  [<ffffffff8110d381>] kthread_bind_mask+0x13/0x15
[    2.531738]  [<ffffffff8110679d>] worker_attach_to_pool+0x39/0x7c
[    2.546650]  [<ffffffff8110866b>] rescuer_thread+0x130/0x318
[    2.548484]  [<ffffffff8110853b>] ? cancel_delayed_work_sync+0x15/0x15
[    2.550411]  [<ffffffff8110853b>] ? cancel_delayed_work_sync+0x15/0x15
[    2.552207]  [<ffffffff8110cd0f>] kthread+0xf8/0x100
[    2.553864]  [<ffffffff8110cc17>] ? kthread_create_on_node+0x184/0x184
[    2.555795]  [<ffffffff81a457c2>] ret_from_fork+0x42/0x70
[    2.557538]  [<ffffffff8110cc17>] ? kthread_create_on_node+0x184/0x184
[    2.572520] ---[ end trace 362b92c9255ab666 ]---

That is the rescue thread attaching itself to a pool that needs help,
and obviously the rescue thread isn't new, so kthread_bind() doesn't work
right.

The best I could come up with is something like the below on top; does
that work for you? I'll go give it some runtime.

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1622,11 +1622,15 @@ static struct worker *alloc_worker(int n
  * cpu-[un]hotplugs.
  */
 static void worker_attach_to_pool(struct worker *worker,
-				   struct worker_pool *pool)
+				   struct worker_pool *pool,
+				   bool new)
 {
 	mutex_lock(&pool->attach_mutex);
 
-	kthread_bind_mask(worker->task, pool->attrs->cpumask);
+	if (new)
+		kthread_bind_mask(worker->task, pool->attrs->cpumask);
+	else
+		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
 	/*
 	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -1712,7 +1716,7 @@ static struct worker *create_worker(stru
 	set_user_nice(worker->task, pool->attrs->nice);
 
 	/* successful, attach the worker to the pool */
-	worker_attach_to_pool(worker, pool);
+	worker_attach_to_pool(worker, pool, true);
 
 	/* start the newly created worker */
 	spin_lock_irq(&pool->lock);
@@ -2241,7 +2245,7 @@ static int rescuer_thread(void *__rescue
 
 		spin_unlock_irq(&wq_mayday_lock);
 
-		worker_attach_to_pool(rescuer, pool);
+		worker_attach_to_pool(rescuer, pool, false);
 
 		spin_lock_irq(&pool->lock);
 		rescuer->pool = pool;


* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-08-07 14:27     ` Peter Zijlstra
@ 2015-08-07 15:16       ` Tejun Heo
  2015-08-07 15:29         ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Tejun Heo @ 2015-08-07 15:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

On Fri, Aug 07, 2015 at 04:27:08PM +0200, Peter Zijlstra wrote:
> Which is the rescue thread attaching itself to a pool that needs help,
> and obviously the rescue thread isn't new so kthread_bind doesn't work
> right.
> 
> The best I could come up with is something like the below on top; does
> that work for you? I'll go give it some runtime.
> 
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1622,11 +1622,15 @@ static struct worker *alloc_worker(int n
>   * cpu-[un]hotplugs.
>   */
>  static void worker_attach_to_pool(struct worker *worker,
> -				   struct worker_pool *pool)
> +				   struct worker_pool *pool,
> +				   bool new)
>  {
>  	mutex_lock(&pool->attach_mutex);
>  
> -	kthread_bind_mask(worker->task, pool->attrs->cpumask);
> +	if (new)
> +		kthread_bind_mask(worker->task, pool->attrs->cpumask);
> +	else
> +		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
>  
>  	/*
>  	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
> @@ -1712,7 +1716,7 @@ static struct worker *create_worker(stru
>  	set_user_nice(worker->task, pool->attrs->nice);
>  
>  	/* successful, attach the worker to the pool */
> -	worker_attach_to_pool(worker, pool);
> +	worker_attach_to_pool(worker, pool, true);
>  
>  	/* start the newly created worker */
>  	spin_lock_irq(&pool->lock);
> @@ -2241,7 +2245,7 @@ static int rescuer_thread(void *__rescue
>  
>  		spin_unlock_irq(&wq_mayday_lock);
>  
> -		worker_attach_to_pool(rescuer, pool);
> +		worker_attach_to_pool(rescuer, pool, false);

Hmmm... the race condition didn't exist for workqueue in the first
place, right?  As long as the flag is set before the affinity is
configured, there's no race condition.  I think the code was better
before.  Can't we just revert workqueue.c part?

Thanks.

-- 
tejun


* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-08-07 15:16       ` Tejun Heo
@ 2015-08-07 15:29         ` Peter Zijlstra
  2015-08-07 15:38           ` Tejun Heo
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-07 15:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

On Fri, Aug 07, 2015 at 11:16:08AM -0400, Tejun Heo wrote:
> On Fri, Aug 07, 2015 at 04:27:08PM +0200, Peter Zijlstra wrote:
> > Which is the rescue thread attaching itself to a pool that needs help,
> > and obviously the rescue thread isn't new so kthread_bind doesn't work
> > right.
> > 
> > The best I could come up with is something like the below on top; does
> > that work for you? I'll go give it some runtime.
> > 
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -1622,11 +1622,15 @@ static struct worker *alloc_worker(int n
> >   * cpu-[un]hotplugs.
> >   */
> >  static void worker_attach_to_pool(struct worker *worker,
> > -				   struct worker_pool *pool)
> > +				   struct worker_pool *pool,
> > +				   bool new)
> >  {
> >  	mutex_lock(&pool->attach_mutex);
> >  
> > -	kthread_bind_mask(worker->task, pool->attrs->cpumask);
> > +	if (new)
> > +		kthread_bind_mask(worker->task, pool->attrs->cpumask);
> > +	else
> > +		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
> >  
> >  	/*
> >  	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
> > @@ -1712,7 +1716,7 @@ static struct worker *create_worker(stru
> >  	set_user_nice(worker->task, pool->attrs->nice);
> >  
> >  	/* successful, attach the worker to the pool */
> > -	worker_attach_to_pool(worker, pool);
> > +	worker_attach_to_pool(worker, pool, true);
> >  
> >  	/* start the newly created worker */
> >  	spin_lock_irq(&pool->lock);
> > @@ -2241,7 +2245,7 @@ static int rescuer_thread(void *__rescue
> >  
> >  		spin_unlock_irq(&wq_mayday_lock);
> >  
> > -		worker_attach_to_pool(rescuer, pool);
> > +		worker_attach_to_pool(rescuer, pool, false);
> 
> Hmmm... the race condition didn't exist for workqueue in the first
> place, right?

No, I think workqueues are susceptible just the same as everybody else.
By the time we call __kthread_bind() the task exists and is visible to
userspace.

        __kthread_bind()
          do_set_cpus_allowed()
                                                <SYSCALL>
                                                  sched_setaffinity()
                                                    if (p->flags & PF_NO_SETAFFINITY) /* false-not-taken */
                                                    set_cpus_allowed_ptr()
          p->flags |= PF_NO_SETAFFINITY

> As long as the flag is set before the affinity is configured, there's
> no race condition.

Even if we were to strictly order those stores you could have (note
there is no matching barrier, as there is only the one load, so ordering
cannot help):

	__kthread_bind()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITY) /* false-not-taken */
	  p->flags |= PF_NO_SETAFFINITY;
	  smp_wmb();
	  do_set_cpus_allowed();
						    set_cpus_allowed_ptr()

> I think the code was better before.  Can't we just revert workqueue.c
> part?

I agree that the new argument isn't pretty, but I cannot see how
workqueues would not be affected by this.


* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-08-07 15:29         ` Peter Zijlstra
@ 2015-08-07 15:38           ` Tejun Heo
  2015-08-07 15:59             ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Tejun Heo @ 2015-08-07 15:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

Hello,

On Fri, Aug 07, 2015 at 05:29:56PM +0200, Peter Zijlstra wrote:
> Even if we were to strictly order those stores you could have (note
> there is no matching barrier, as there is only the one load, so ordering
> cannot help):
> 
> 	__kthread_bind()
> 						<SYSCALL>
> 						  sched_setaffinity()
> 						    if (p->flags & PF_NO_SETAFFINITY) /* false-not-taken */
> 	  p->flags |= PF_NO_SETAFFINITY;
> 	  smp_wmb();
> 	  do_set_cpus_allowed();
> 						    set_cpus_allowed_ptr()
>
> > I think the code was better before.  Can't we just revert workqueue.c
> > part?
> 
> I agree that the new argument isn't pretty, but I cannot see how
> workqueues would not be affected by this.

So, the problem there is that __kthread_bind() doesn't grab the same
lock that the syscall side grabs, but workqueue used
set_cpus_allowed_ptr(), which goes through the rq locking; so as long
as the check on the syscall side is moved inside the rq lock, it should
be fine.

Thanks.

-- 
tejun


* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-08-07 15:38           ` Tejun Heo
@ 2015-08-07 15:59             ` Peter Zijlstra
  2015-08-07 16:09               ` Tejun Heo
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-07 15:59 UTC (permalink / raw)
  To: Tejun Heo
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

On Fri, Aug 07, 2015 at 11:38:28AM -0400, Tejun Heo wrote:
> Hello,
> 
> On Fri, Aug 07, 2015 at 05:29:56PM +0200, Peter Zijlstra wrote:
> > Even if we were to strictly order those stores you could have (note
> > there is no matching barrier, as there is only the one load, so ordering
> > cannot help):
> > 
> > 	__kthread_bind()
> > 						<SYSCALL>
> > 						  sched_setaffinity()
> > 						    if (p->flags & PF_NO_SETAFFINITY) /* false-not-taken */
> > 	  p->flags |= PF_NO_SETAFFINITY;
> > 	  smp_wmb();
> > 	  do_set_cpus_allowed();
> > 						    set_cpus_allowed_ptr()
> >
> > > I think the code was better before.  Can't we just revert workqueue.c
> > > part?
> > 
> > I agree that the new argument isn't pretty, but I cannot see how
> > workqueues would not be affected by this.
> 
> So, the problem there is that __kthread_bind() doesn't grab the same
> lock that the syscall side grabs, but workqueue used
> set_cpus_allowed_ptr(), which goes through the rq locking, so as long
> as the check on the syscall side is moved inside the rq lock, it should
> be fine.

Currently neither site uses any lock, and that is what the patch fixes
(it uses the per-task ->pi_lock instead of the rq->lock, but that is
immaterial).

What matters though is that you now must hold a scheduler lock while
setting PF_NO_SETAFFINITY. In order to avoid spreading that knowledge
around I've taught kthread_bind*() about this and made the workqueue
code use that API (rather than having the workqueue code take scheduler
locks).

Hmm.. a better solution: have the worker thread creation call
kthread_bind_mask() before attach_to_pool(), and have attach_to_pool()
keep using set_cpus_allowed_ptr(). Less ugly.
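
As a usage sketch of the new helper (assuming the kthread_bind_mask()
signature from the patch below; the thread function and module
boilerplate are made up for illustration), a kthread creator would bind
the mask before the first wakeup, so the cpumask and PF_NO_SETAFFINITY
are set together under ->pi_lock while the task is still inactive:

	#include <linux/kthread.h>
	#include <linux/module.h>
	#include <linux/cpumask.h>
	#include <linux/delay.h>
	#include <linux/err.h>

	static struct task_struct *example_task;

	static int example_threadfn(void *data)
	{
		while (!kthread_should_stop())
			msleep_interruptible(1000);
		return 0;
	}

	static int __init example_init(void)
	{
		example_task = kthread_create(example_threadfn, NULL, "example");
		if (IS_ERR(example_task))
			return PTR_ERR(example_task);

		/* cpumask and PF_NO_SETAFFINITY set atomically, before first run */
		kthread_bind_mask(example_task, cpu_possible_mask);
		wake_up_process(example_task);
		return 0;
	}

	static void __exit example_exit(void)
	{
		kthread_stop(example_task);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");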

---
Subject: sched: Fix a race between __kthread_bind() and sched_setaffinity()
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 May 2015 17:43:34 +0200

Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it.

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the issue by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of
task_struct::{nr_,}cpus_allowed.

Cc: riel@redhat.com
Cc: dedekind1@gmail.com
Cc: mgorman@suse.de
Cc: rostedt@goodmis.org
Cc: juri.lelli@arm.com
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: mingo@kernel.org
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org
---
 include/linux/kthread.h |    1 +
 include/linux/sched.h   |    7 -------
 kernel/kthread.c        |   20 +++++++++++++++++---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++++++++++----
 kernel/workqueue.c      |    6 ++----
 5 files changed, 52 insertions(+), 18 deletions(-)

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cp
 })
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2215,13 +2215,6 @@ static inline void calc_load_enter_idle(
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-	return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_no
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
+	unsigned long flags;
+
 	if (!wait_task_inactive(p, state)) {
 		WARN_ON(1);
 		return;
 	}
+
 	/* It's safe because the task is inactive. */
-	do_set_cpus_allowed(p, cpumask_of(cpu));
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	do_set_cpus_allowed(p, mask);
 	p->flags |= PF_NO_SETAFFINITY;
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+	__kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }
 
 /**
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,6 +1151,8 @@ static int migration_cpu_stop(void *data
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
@@ -1167,7 +1169,8 @@ void do_set_cpus_allowed(struct task_str
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -1176,6 +1179,15 @@ int set_cpus_allowed_ptr(struct task_str
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -1212,6 +1224,11 @@ int set_cpus_allowed_ptr(struct task_str
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1593,6 +1610,15 @@ static void update_avg(u64 *avg, u64 sam
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */
 
 static void
@@ -4338,7 +4364,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4863,7 +4889,8 @@ void init_idle(struct task_struct *idle,
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4889,7 +4916,8 @@ void init_idle(struct task_struct *idle,
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(stru
 		goto fail;
 
 	set_user_nice(worker->task, pool->attrs->nice);
-
-	/* prevent userland from meddling with cpumask of workqueue workers */
-	worker->task->flags |= PF_NO_SETAFFINITY;
+	kthread_bind_mask(worker->task, pool->attrs->cpumask);
 
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workque
 		}
 
 		wq->rescuer = rescuer;
-		rescuer->task->flags |= PF_NO_SETAFFINITY;
+		kthread_bind_mask(rescuer->task, cpu_possible_mask);
 		wake_up_process(rescuer->task);
 	}
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-08-07 15:59             ` Peter Zijlstra
@ 2015-08-07 16:09               ` Tejun Heo
  0 siblings, 0 replies; 32+ messages in thread
From: Tejun Heo @ 2015-08-07 16:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, riel, dedekind1, linux-kernel, mgorman, rostedt,
	juri.lelli, Oleg Nesterov

Hello, Peter.

On Fri, Aug 07, 2015 at 05:59:54PM +0200, Peter Zijlstra wrote:
> > So, the problem there is that __kthread_bind() doesn't grab the same
> > lock that the syscall side grabs, but workqueue used
> > set_cpus_allowed_ptr(), which goes through the rq locking, so as long
> > as the check on the syscall side is moved inside the rq lock, it should
> > be fine.
> 
> Currently neither site uses any lock, and that is what the patch fixes
> (it uses the per-task ->pi_lock instead of the rq->lock, but that is
> immaterial).

Yeap, the testing on the syscall side should definitely be moved
inside rq->lock.

> What matters though is that you now must hold a scheduler lock while
> setting PF_NO_SETAFFINITY. In order to avoid spreading that knowledge
> around I've taught kthread_bind*() about this and made the workqueue
> code use that API (rather than having the workqueue code take scheduler
> locks).

So, as long as PF_NO_SETAFFINITY is set before the task sets its
affinity to its target while holding the rq lock, it should still be safe.

> Hmm.. a better solution. Have the worker thread creation call
> kthread_bind_mask() before attach_to_pool() and have attach_to_pool()
> keep using set_cpus_allowed_ptr(). Less ugly.

Yeah, that works too.  About the same effect.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [tip:sched/core] sched: Fix a race between __kthread_bind() and sched_setaffinity()
  2015-05-15 15:43 ` [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity() Peter Zijlstra
  2015-05-15 15:56   ` Tejun Heo
@ 2015-08-12 12:38   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-08-12 12:38 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, oleg, mingo, peterz, efault, tj, torvalds, tglx

Commit-ID:  25834c73f93af7f0712c98ca4593691592e6b360
Gitweb:     http://git.kernel.org/tip/25834c73f93af7f0712c98ca4593691592e6b360
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 15 May 2015 17:43:34 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 12 Aug 2015 12:06:09 +0200

sched: Fix a race between __kthread_bind() and sched_setaffinity()

Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the bug by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kthread.h |  1 +
 include/linux/sched.h   |  7 -------
 kernel/kthread.c        | 20 +++++++++++++++++---
 kernel/sched/core.c     | 36 ++++++++++++++++++++++++++++++++----
 kernel/workqueue.c      |  6 ++----
 5 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 13d5520..869b21d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 })
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44dca5b..81bb457 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2203,13 +2203,6 @@ static inline void calc_load_enter_idle(void) { }
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-	return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c..7c40a18 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
+	unsigned long flags;
+
 	if (!wait_task_inactive(p, state)) {
 		WARN_ON(1);
 		return;
 	}
+
 	/* It's safe because the task is inactive. */
-	do_set_cpus_allowed(p, cpumask_of(cpu));
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	do_set_cpus_allowed(p, mask);
 	p->flags |= PF_NO_SETAFFINITY;
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+	__kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea6d743..2e3b983 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1153,6 +1153,8 @@ static int migration_cpu_stop(void *data)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
@@ -1169,7 +1171,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -1178,6 +1181,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -1214,6 +1226,11 @@ out:
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1612,15 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */
 
 static void
@@ -4340,7 +4366,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4865,7 +4891,8 @@ void init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4891,7 +4918,8 @@ void init_idle(struct task_struct *idle, int cpu)
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f061..f5782d5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 		goto fail;
 
 	set_user_nice(worker->task, pool->attrs->nice);
-
-	/* prevent userland from meddling with cpumask of workqueue workers */
-	worker->task->flags |= PF_NO_SETAFFINITY;
+	kthread_bind_mask(worker->task, pool->attrs->cpumask);
 
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 		}
 
 		wq->rescuer = rescuer;
-		rescuer->task->flags |= PF_NO_SETAFFINITY;
+		kthread_bind_mask(rescuer->task, cpu_possible_mask);
 		wake_up_process(rescuer->task);
 	}
 

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [tip:sched/core] sched: Make sched_class::set_cpus_allowed() unconditional
  2015-05-15 15:43 ` [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional Peter Zijlstra
@ 2015-08-12 12:38   ` tip-bot for Peter Zijlstra
  2015-08-20 16:45   ` [RFC][PATCH 2/4] " Sasha Levin
  1 sibling, 0 replies; 32+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-08-12 12:38 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: mingo, linux-kernel, efault, torvalds, tglx, hpa, peterz

Commit-ID:  c5b2803840817115e9b568d5054e5007ae36176b
Gitweb:     http://git.kernel.org/tip/c5b2803840817115e9b568d5054e5007ae36176b
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 15 May 2015 17:43:35 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 12 Aug 2015 12:06:09 +0200

sched: Make sched_class::set_cpus_allowed() unconditional

Give every class a set_cpus_allowed() method; this enables some small
optimization in the RT,DL implementation by avoiding a double
cpumask_weight() call.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.614517487@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 17 +++++++++++------
 kernel/sched/deadline.c  | 20 ++++++++++++--------
 kernel/sched/fair.c      |  1 +
 kernel/sched/idle_task.c |  1 +
 kernel/sched/rt.c        | 12 ++++++++----
 kernel/sched/sched.h     |  2 ++
 kernel/sched/stop_task.c |  1 +
 7 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e3b983..740f90b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,17 +1151,22 @@ static int migration_cpu_stop(void *data)
 	return 0;
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	lockdep_assert_held(&p->pi_lock);
-
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	lockdep_assert_held(&p->pi_lock);
+	p->sched_class->set_cpus_allowed(p, new_mask);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 20772ee..dc357fa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1696,13 +1696,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	/*
-	 * Update only if the task is actually running (i.e.,
-	 * it is on the rq AND it is not throttled).
-	 */
-	if (!on_dl_rq(&p->dl))
-		return;
-
 	weight = cpumask_weight(new_mask);
 
 	/*
@@ -1710,7 +1703,14 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 	 * can migrate or not.
 	 */
 	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		return;
+		goto done;
+
+	/*
+	 * Update only if the task is actually running (i.e.,
+	 * it is on the rq AND it is not throttled).
+	 */
+	if (!on_dl_rq(&p->dl))
+		goto done;
 
 	/*
 	 * The process used to be able to migrate OR it can now migrate
@@ -1727,6 +1727,10 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 	}
 
 	update_dl_migration(&rq->dl);
+
+done:
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f0950fd..6e2e348 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8252,6 +8252,7 @@ const struct sched_class fair_sched_class = {
 
 	.task_waking		= task_waking_fair,
 	.task_dead		= task_dead_fair,
+	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
 
 	.set_curr_task          = set_curr_task_fair,
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c65dac8..c4ae0f1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_idle,
+	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
 
 	.set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00816ee..63692ef 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2084,9 +2084,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 
 	BUG_ON(!rt_task(p));
 
-	if (!task_on_rq_queued(p))
-		return;
-
 	weight = cpumask_weight(new_mask);
 
 	/*
@@ -2094,7 +2091,10 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 	 * can migrate or not.
 	 */
 	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		return;
+		goto done;
+
+	if (!task_on_rq_queued(p))
+		goto done;
 
 	rq = task_rq(p);
 
@@ -2113,6 +2113,10 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 	}
 
 	update_rt_migration(&rq->rt);
+
+done:
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 22ccc55..68cda11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1255,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq);
 extern void idle_enter_fair(struct rq *this_rq);
 extern void idle_exit_fair(struct rq *this_rq);
 
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
 #else
 
 static inline void idle_enter_fair(struct rq *rq) { }
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec4..cbc67da 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_stop,
+	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
 
 	.set_curr_task          = set_curr_task_stop,

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) calling context
  2015-05-15 15:43 ` [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context Peter Zijlstra
       [not found]   ` <OF66BF3765.2EBFD3B1-ON48257E49.0028DC79-48257E49.0029F058@zte.com.cn>
@ 2015-08-12 12:38   ` tip-bot for Peter Zijlstra
  2015-08-13 18:47     ` Sasha Levin
  1 sibling, 1 reply; 32+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-08-12 12:38 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: mingo, linux-kernel, tglx, peterz, torvalds, hpa, efault

Commit-ID:  6c37067e27867db172b988cc11b9ff921175dee5
Gitweb:     http://git.kernel.org/tip/6c37067e27867db172b988cc11b9ff921175dee5
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 15 May 2015 17:43:36 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 12 Aug 2015 12:06:10 +0200

sched: Change the sched_class::set_cpus_allowed() calling context

Change the calling context of sched_class::set_cpus_allowed() such
that we can assume the task is inactive.

This allows us to easily make changes that affect accounting done by
enqueue/dequeue. This does in fact completely remove
set_cpus_allowed_rt() and greatly reduces set_cpus_allowed_dl().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.667516139@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 23 +++++++++++++++++++++++
 kernel/sched/deadline.c | 39 ++-------------------------------------
 kernel/sched/rt.c       | 45 +--------------------------------------------
 3 files changed, 26 insertions(+), 81 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 740f90b..9917c96 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1163,8 +1163,31 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
 	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+		dequeue_task(rq, p, 0);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
 	p->sched_class->set_cpus_allowed(p, new_mask);
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (queued)
+		enqueue_task(rq, p, 0);
 }
 
 /*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index dc357fa..b473056 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1668,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
 static void set_cpus_allowed_dl(struct task_struct *p,
 				const struct cpumask *new_mask)
 {
-	struct rq *rq;
 	struct root_domain *src_rd;
-	int weight;
+	struct rq *rq;
 
 	BUG_ON(!dl_task(p));
 
@@ -1696,41 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	weight = cpumask_weight(new_mask);
-
-	/*
-	 * Only update if the process changes its state from whether it
-	 * can migrate or not.
-	 */
-	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		goto done;
-
-	/*
-	 * Update only if the task is actually running (i.e.,
-	 * it is on the rq AND it is not throttled).
-	 */
-	if (!on_dl_rq(&p->dl))
-		goto done;
-
-	/*
-	 * The process used to be able to migrate OR it can now migrate
-	 */
-	if (weight <= 1) {
-		if (!task_current(rq, p))
-			dequeue_pushable_dl_task(rq, p);
-		BUG_ON(!rq->dl.dl_nr_migratory);
-		rq->dl.dl_nr_migratory--;
-	} else {
-		if (!task_current(rq, p))
-			enqueue_pushable_dl_task(rq, p);
-		rq->dl.dl_nr_migratory++;
-	}
-
-	update_dl_migration(&rq->dl);
-
-done:
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = weight;
+	set_cpus_allowed_common(p, new_mask);
 }
 
 /* Assumes rq->lock is held */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 63692ef..d2ea593 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2076,49 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
 		push_rt_tasks(rq);
 }
 
-static void set_cpus_allowed_rt(struct task_struct *p,
-				const struct cpumask *new_mask)
-{
-	struct rq *rq;
-	int weight;
-
-	BUG_ON(!rt_task(p));
-
-	weight = cpumask_weight(new_mask);
-
-	/*
-	 * Only update if the process changes its state from whether it
-	 * can migrate or not.
-	 */
-	if ((p->nr_cpus_allowed > 1) == (weight > 1))
-		goto done;
-
-	if (!task_on_rq_queued(p))
-		goto done;
-
-	rq = task_rq(p);
-
-	/*
-	 * The process used to be able to migrate OR it can now migrate
-	 */
-	if (weight <= 1) {
-		if (!task_current(rq, p))
-			dequeue_pushable_task(rq, p);
-		BUG_ON(!rq->rt.rt_nr_migratory);
-		rq->rt.rt_nr_migratory--;
-	} else {
-		if (!task_current(rq, p))
-			enqueue_pushable_task(rq, p);
-		rq->rt.rt_nr_migratory++;
-	}
-
-	update_rt_migration(&rq->rt);
-
-done:
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = weight;
-}
-
 /* Assumes rq->lock is held */
 static void rq_online_rt(struct rq *rq)
 {
@@ -2327,7 +2284,7 @@ const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_rt,
 
-	.set_cpus_allowed       = set_cpus_allowed_rt,
+	.set_cpus_allowed       = set_cpus_allowed_common,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
 	.task_woken		= task_woken_rt,

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) calling context
  2015-08-12 12:38   ` [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) " tip-bot for Peter Zijlstra
@ 2015-08-13 18:47     ` Sasha Levin
  2015-08-13 20:37       ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Sasha Levin @ 2015-08-13 18:47 UTC (permalink / raw)
  To: hpa, torvalds, efault, tglx, peterz, linux-kernel, mingo,
	linux-tip-commits

On 08/12/2015 08:38 AM, tip-bot for Peter Zijlstra wrote:
> Commit-ID:  6c37067e27867db172b988cc11b9ff921175dee5
> Gitweb:     http://git.kernel.org/tip/6c37067e27867db172b988cc11b9ff921175dee5
> Author:     Peter Zijlstra <peterz@infradead.org>
> AuthorDate: Fri, 15 May 2015 17:43:36 +0200
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Wed, 12 Aug 2015 12:06:10 +0200
> 
> sched: Change the sched_class::set_cpus_allowed() calling context
> 
> Change the calling context of sched_class::set_cpus_allowed() such
> that we can assume the task is inactive.
> 
> This allows us to easily make changes that affect accounting done by
> enqueue/dequeue. This does in fact completely remove
> set_cpus_allowed_rt() and greatly reduces set_cpus_allowed_dl().
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Mike Galbraith <efault@gmx.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: dedekind1@gmail.com
> Cc: juri.lelli@arm.com
> Cc: mgorman@suse.de
> Cc: riel@redhat.com
> Cc: rostedt@goodmis.org
> Link: http://lkml.kernel.org/r/20150515154833.667516139@infradead.org
> Signed-off-by: Ingo Molnar <mingo@kernel.org>

Hey Peter,

This patch breaks boot inside my VM:

[79817.224383] bad: scheduling from the idle thread!
[79817.224900] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc6-next-20150813-sasha-00040-g059fd6d #2431
[79817.225709]  ffffffffae0c6320 ffff880051b77c00 ffffffffade7f739 ffff8800a69e15c0
[79817.226419]  ffff880051b77c18 ffffffffa43c4ed3 ffff880051be8000 ffff880051b77c48
[79817.227122]  ffffffffa43a7ffb ffff880051be8000 ffff8800a69e15c0 ffffffffae094d40
[79817.227806] Call Trace:
[79817.228039] dump_stack (lib/dump_stack.c:52)
[79817.228503] dequeue_task_idle (kernel/sched/idle_task.c:45)
[79817.229005] dequeue_task (kernel/sched/core.c:839)
[79817.229485] do_set_cpus_allowed (kernel/sched/core.c:1178 (discriminator 9))
[79817.230006] init_idle (kernel/sched/core.c:4937)
[79817.230467] idle_thread_get (kernel/smpboot.c:35)
[79817.230945] ? cpu_up (kernel/cpu.c:569)
[79817.231374] _cpu_up (kernel/cpu.c:513)
[79817.231801] ? cpu_down (kernel/cpu.c:500)
[79817.232254] ? fork_idle (kernel/fork.c:1688)
[79817.232731] ? find_next_bit (lib/find_bit.c:65)
[79817.233216] ? idle_threads_init (include/linux/cpumask.h:189 kernel/smpboot.c:71)
[79817.233715] cpu_up (kernel/cpu.c:574)
[79817.234132] smp_init (kernel/smp.c:578)
[79817.234557] kernel_init_freeable (init/main.c:690 init/main.c:884 init/main.c:1009)
[79817.235095] ? start_kernel (init/main.c:979)
[79817.235575] ? finish_task_switch (kernel/sched/sched.h:1087 kernel/sched/core.c:2526)
[79817.236106] ? rest_init (init/main.c:934)
[79817.236567] kernel_init (init/main.c:939)
[79817.237020] ? rest_init (init/main.c:934)
[79817.237496] ret_from_fork (arch/x86/entry/entry_64.S:473)
[79817.237994] ? rest_init (init/main.c:934)
[79817.238477] BUG: unable to handle kernel NULL pointer dereference at           (null)
[79817.239170] IP: [< (null)>] null)  (??:?)
[79817.239608] PGD 0
[79817.239802] Oops: 0010 [#1] SMP DEBUG_PAGEALLOC KASAN
[79817.240295] Dumping ftrace buffer:
[79817.240593]    (ftrace buffer empty)
[79817.240899] Modules linked in:
[79817.241180] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc6-next-20150813-sasha-00040-g059fd6d #2431
[79817.241957] task: ffff880278228000 ti: ffff880051b70000 task.ti: ffff880051b70000
[79817.242573] RIP: [< (null)>] null)  (??:?)
[79817.243203] RSP: 0000:ffff880051b77c18  EFLAGS: 00010046
[79817.243646] RAX: dffffc0000000000 RBX: ffff880051be8000 RCX: 0000000000000004
[79817.244229] RDX: 0000000000000000 RSI: ffff880051be8000 RDI: ffff8800a69e15c0
[79817.244815] RBP: ffff880051b77c48 R08: 0000000000000001 R09: 0000000000000004
[79817.245404] R10: ffffed00ff815e03 R11: ffffed00ff815e01 R12: ffff8800a69e15c0
[79817.245987] R13: ffffffffae0c6320 R14: 0000000000000000 R15: 00004897e6254646
[79817.246577] FS:  0000000000000000(0000) GS:ffff880052600000(0000) knlGS:0000000000000000
[79817.247243] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[79817.247723] CR2: 0000000000000000 CR3: 0000000030a2d000 CR4: 00000000000006b0
[79817.248319] Stack:
[79817.248496]  ffffffffa43a7d9b ffff880051be8000 ffff8800a69e15c0 ffffffffae0c6320
[79817.249176]  ffff880051be8058 0000000000000001 ffff880051b77c88 ffffffffa43aab66
[79817.249850]  ffff880051be8000 ffff8800a69e15c0 0000000000000001 0000000000000001
[79817.250521] Call Trace:
[79817.250735] ? enqueue_task (kernel/sched/core.c:832)
[79817.251211] do_set_cpus_allowed (kernel/sched/core.c:1189)
[79817.251720] init_idle (kernel/sched/core.c:4937)
[79817.252153] idle_thread_get (kernel/smpboot.c:35)
[79817.252623] ? cpu_up (kernel/cpu.c:569)
[79817.253045] _cpu_up (kernel/cpu.c:513)
[79817.253471] ? cpu_down (kernel/cpu.c:500)
[79817.253902] ? fork_idle (kernel/fork.c:1688)
[79817.254354] ? find_next_bit (lib/find_bit.c:65)
[79817.254820] ? idle_threads_init (include/linux/cpumask.h:189 kernel/smpboot.c:71)
[79817.255326] cpu_up (kernel/cpu.c:574)
[79817.255729] smp_init (kernel/smp.c:578)
[79817.256158] kernel_init_freeable (init/main.c:690 init/main.c:884 init/main.c:1009)
[79817.256675] ? start_kernel (init/main.c:979)
[79817.257158] ? finish_task_switch (kernel/sched/sched.h:1087 kernel/sched/core.c:2526)
[79817.257677] ? rest_init (init/main.c:934)
[79817.258143] kernel_init (init/main.c:939)
[79817.258604] ? rest_init (init/main.c:934)
[79817.259094] ret_from_fork (arch/x86/entry/entry_64.S:473)
[79817.259558] ? rest_init (init/main.c:934)
[79817.260021] Code: Bad RIP value.

Code starting with the faulting instruction
===========================================
[79817.260342] RIP [< (null)>] null)  (??:?)
[79817.260791]  RSP <ffff880051b77c18>
[79817.261098] CR2: 0000000000000000
[79817.261415] ---[ end trace eddc979a4104e4f3 ]---
[79817.261818] Kernel panic - not syncing: Fatal exception


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) calling context
  2015-08-13 18:47     ` Sasha Levin
@ 2015-08-13 20:37       ` Peter Zijlstra
  2015-08-13 20:59         ` Sasha Levin
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-13 20:37 UTC (permalink / raw)
  To: Sasha Levin
  Cc: hpa, torvalds, efault, tglx, linux-kernel, mingo, linux-tip-commits

On Thu, Aug 13, 2015 at 02:47:20PM -0400, Sasha Levin wrote:
> [79817.224383] bad: scheduling from the idle thread!
> [79817.224900] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc6-next-20150813-sasha-00040-g059fd6d #2431
> [79817.225709]  ffffffffae0c6320 ffff880051b77c00 ffffffffade7f739 ffff8800a69e15c0
> [79817.226419]  ffff880051b77c18 ffffffffa43c4ed3 ffff880051be8000 ffff880051b77c48
> [79817.227122]  ffffffffa43a7ffb ffff880051be8000 ffff8800a69e15c0 ffffffffae094d40
> [79817.227806] Call Trace:
> [79817.228039] dump_stack (lib/dump_stack.c:52)
> [79817.228503] dequeue_task_idle (kernel/sched/idle_task.c:45)
> [79817.229005] dequeue_task (kernel/sched/core.c:839)
> [79817.229485] do_set_cpus_allowed (kernel/sched/core.c:1178 (discriminator 9))
> [79817.230006] init_idle (kernel/sched/core.c:4937)
> [79817.230467] idle_thread_get (kernel/smpboot.c:35)

Dhurr.. so oddly enough my actual hardware boots without issue.

But I can see how that could go wrong, does the below make your
(virtual) machine happy again?

---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 56aed8fce3cb..2d2871e2cf5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4924,7 +4924,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	do_set_cpus_allowed(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) calling context
  2015-08-13 20:37       ` Peter Zijlstra
@ 2015-08-13 20:59         ` Sasha Levin
  2015-08-13 21:30           ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Sasha Levin @ 2015-08-13 20:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: hpa, torvalds, efault, tglx, linux-kernel, mingo, linux-tip-commits

On 08/13/2015 04:37 PM, Peter Zijlstra wrote:
> On Thu, Aug 13, 2015 at 02:47:20PM -0400, Sasha Levin wrote:
>> > [79817.224383] bad: scheduling from the idle thread!
>> > [79817.224900] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc6-next-20150813-sasha-00040-g059fd6d #2431
>> > [79817.225709]  ffffffffae0c6320 ffff880051b77c00 ffffffffade7f739 ffff8800a69e15c0
>> > [79817.226419]  ffff880051b77c18 ffffffffa43c4ed3 ffff880051be8000 ffff880051b77c48
>> > [79817.227122]  ffffffffa43a7ffb ffff880051be8000 ffff8800a69e15c0 ffffffffae094d40
>> > [79817.227806] Call Trace:
>> > [79817.228039] dump_stack (lib/dump_stack.c:52)
>> > [79817.228503] dequeue_task_idle (kernel/sched/idle_task.c:45)
>> > [79817.229005] dequeue_task (kernel/sched/core.c:839)
>> > [79817.229485] do_set_cpus_allowed (kernel/sched/core.c:1178 (discriminator 9))
>> > [79817.230006] init_idle (kernel/sched/core.c:4937)
>> > [79817.230467] idle_thread_get (kernel/smpboot.c:35)
> Dhurr.. so oddly enough my actual hardware boots without issue.
> 
> But I can see how that could go wrong, does the below make your
> (virtual) machine happy again?

Seems to work fine now, thanks!


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) calling context
  2015-08-13 20:59         ` Sasha Levin
@ 2015-08-13 21:30           ` Peter Zijlstra
  0 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-13 21:30 UTC (permalink / raw)
  To: Sasha Levin
  Cc: hpa, torvalds, efault, tglx, linux-kernel, mingo, linux-tip-commits

On Thu, Aug 13, 2015 at 04:59:01PM -0400, Sasha Levin wrote:

> Seems to work fine now, thanks!

I've no clue how this doesn't also explode on actual hardware; that code
is convoluted, read and weep.

Ingo, please stick this somewhere appropriate :-)

---
Subject: sched: Avoid trying to dequeue/enqueue the idle thread
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Aug 13 23:09:29 CEST 2015

Sasha reports that his virtual machine tries to schedule the idle
thread since commit 6c37067e2786 ("sched: Change the
sched_class::set_cpus_allowed() calling context").

His trace shows this happening from idle_thread_get()->init_idle(),
which is the _second_ init_idle() invocation on that task_struct, the
first being done through idle_init()->fork_idle(). (this code is
insane...)

Because we call init_idle() twice in a row, its ->sched_class ==
&idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means
do_set_cpus_allowed() thinks we're queued and will call dequeue_task(),
which is implemented with BUG() for the idle class, seeing how
dequeueing the idle task is a daft thing.

Aside from the whole insanity of calling init_idle() _twice_, change the
code to call set_cpus_allowed_common() instead, as this is 'obviously'
before the idle task gets run, etc.

Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context")
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4924,7 +4924,7 @@ void init_idle(struct task_struct *idle,
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	do_set_cpus_allowed(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional
  2015-05-15 15:43 ` [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional Peter Zijlstra
  2015-08-12 12:38   ` [tip:sched/core] " tip-bot for Peter Zijlstra
@ 2015-08-20 16:45   ` Sasha Levin
  1 sibling, 0 replies; 32+ messages in thread
From: Sasha Levin @ 2015-08-20 16:45 UTC (permalink / raw)
  To: Peter Zijlstra, mingo, riel
  Cc: dedekind1, linux-kernel, mgorman, rostedt, juri.lelli

On 05/15/2015 11:43 AM, Peter Zijlstra wrote:
> Give every class a set_cpus_allowed() method; this enables some small
> optimization in the rt,dl implementation by avoiding a double
> cpumask_weight() call.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Hey Peter,

Here's the splat I mentioned earlier. I'm not sure if it's caused by this commit
or not; it's just that git blame took me here.

[392896.406248] WARNING: CPU: 9 PID: 51 at kernel/sched/core.c:1178 do_set_cpus_allowed+0x1cf/0x480()
[392896.407070] Modules linked in:
[392896.407408] CPU: 9 PID: 51 Comm: migration/9 Not tainted 4.2.0-rc6-next-20150817-sasha-00041-g9b6b2ab-dirty #2460
[392896.408376]  ffffffffab2c2ac0 ffff880477857948 ffffffffaafaf5a9 0000000000000000
[392896.409126]  ffff880477857988 ffffffffa1323246 ffffffffa13ab19f ffff880477284000
[392896.409874]  ffff8804781e15c0 ffff88006bc03d98 ffffffffae480d20 0000000000000001
[392896.410610] Call Trace:
[392896.410947] dump_stack (lib/dump_stack.c:52)
[392896.411460] warn_slowpath_common (kernel/panic.c:448)
[392896.412037] ? do_set_cpus_allowed (kernel/sched/sched.h:1050 (discriminator 9) kernel/sched/core.c:1180 (discriminator 9))
[392896.412629] warn_slowpath_null (kernel/panic.c:482)
[392896.413183] do_set_cpus_allowed (kernel/sched/sched.h:1050 (discriminator 9) kernel/sched/core.c:1180 (discriminator 9))
[392896.413778] cpuset_cpus_allowed_fallback (include/linux/rcupdate.h:911 kernel/cpuset.c:2381)
[392896.414415] ? cpuset_cpus_allowed_fallback (kernel/cpuset.c:2378)
[392896.415055] select_fallback_rq (kernel/sched/core.c:1589)
[392896.415636] ? put_prev_entity (kernel/sched/fair.c:3218)
[392896.416234] migration_call (kernel/sched/core.c:5230 kernel/sched/core.c:5492)
[392896.416786] notifier_call_chain (kernel/notifier.c:93)
[392896.417359] ? cpu_notify (kernel/cpu.c:334)
[392896.417863] __raw_notifier_call_chain (kernel/notifier.c:395)
[392896.419027] cpu_notify (include/linux/notifier.h:179 kernel/cpu.c:231 kernel/cpu.c:236)
[392896.419616] take_cpu_down (kernel/cpu.c:345)
[392896.420127] multi_cpu_stop (kernel/stop_machine.c:203)
[392896.420671] ? queue_stop_cpus_work (kernel/stop_machine.c:172)
[392896.421275] cpu_stopper_thread (kernel/stop_machine.c:440)
[392896.421849] ? cpu_stop_create (kernel/stop_machine.c:415)
[392896.422404] ? lock_release (kernel/locking/lockdep.c:3643)
[392896.422936] ? __raw_callee_save___pv_queued_spin_unlock (??:?)
[392896.423710] ? _raw_spin_unlock_irqrestore (./arch/x86/include/asm/paravirt.h:802 include/linux/spinlock_api_smp.h:162 kernel/locking/spinlock.c:191)
[392896.424354] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2594 kernel/locking/lockdep.c:2636)
[392896.424978] ? _raw_spin_unlock_irqrestore (./arch/x86/include/asm/preempt.h:77 include/linux/spinlock_api_smp.h:163 kernel/locking/spinlock.c:191)
[392896.425610] ? cpu_stop_create (kernel/stop_machine.c:415)
[392896.426152] smpboot_thread_fn (kernel/smpboot.c:163 (discriminator 1))
[392896.426707] ? sort_range (kernel/smpboot.c:106)
[392896.427252] ? __kthread_parkme (kernel/kthread.c:165)
[392896.427817] ? sort_range (kernel/smpboot.c:106)
[392896.428332] kthread (kernel/kthread.c:209)
[392896.428862] ? kthread_stop (kernel/kthread.c:178)
[392896.429399] ? lock_release (kernel/locking/lockdep.c:3643)
[392896.429977] ? wait_for_completion (kernel/sched/completion.c:77 kernel/sched/completion.c:93 kernel/sched/completion.c:101 kernel/sched/completion.c:122)


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2015-08-20 16:46 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-15 15:43 [RFC][PATCH 0/4] sched,numa: pinned task accounting Peter Zijlstra
2015-05-15 15:43 ` [RFC][PATCH 1/4] sched: Fix a race between __kthread_bind() and sched_setaffinity() Peter Zijlstra
2015-05-15 15:56   ` Tejun Heo
2015-08-07 14:27     ` Peter Zijlstra
2015-08-07 15:16       ` Tejun Heo
2015-08-07 15:29         ` Peter Zijlstra
2015-08-07 15:38           ` Tejun Heo
2015-08-07 15:59             ` Peter Zijlstra
2015-08-07 16:09               ` Tejun Heo
2015-08-12 12:38   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2015-05-15 15:43 ` [RFC][PATCH 2/4] sched: Make sched_class::set_cpus_allowed() unconditional Peter Zijlstra
2015-08-12 12:38   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2015-08-20 16:45   ` [RFC][PATCH 2/4] " Sasha Levin
2015-05-15 15:43 ` [RFC][PATCH 3/4] sched: Change sched_class::set_cpus_allowed calling context Peter Zijlstra
     [not found]   ` <OF66BF3765.2EBFD3B1-ON48257E49.0028DC79-48257E49.0029F058@zte.com.cn>
2015-05-18  8:32     ` Peter Zijlstra
2015-05-18  9:34       ` Juri Lelli
2015-05-18 20:04       ` Peter Zijlstra
2015-08-12 12:38   ` [tip:sched/core] sched: Change the sched_class::set_cpus_allowed( ) " tip-bot for Peter Zijlstra
2015-08-13 18:47     ` Sasha Levin
2015-08-13 20:37       ` Peter Zijlstra
2015-08-13 20:59         ` Sasha Levin
2015-08-13 21:30           ` Peter Zijlstra
2015-05-15 15:43 ` [RFC][PATCH 4/4] sched, numa: Ignore pinned tasks Peter Zijlstra
2015-05-15 19:05   ` Rik van Riel
2015-05-16  9:31   ` Peter Zijlstra
2015-05-18 13:00   ` Srikar Dronamraju
2015-05-18 13:06     ` Peter Zijlstra
2015-05-18 14:13       ` Rik van Riel
2015-05-18 14:29       ` Srikar Dronamraju
2015-05-18 15:09         ` Peter Zijlstra
2015-05-18 13:10   ` Srikar Dronamraju
2015-05-18  9:08 ` [RFC][PATCH 0/4] sched,numa: pinned task accounting Artem Bityutskiy
