* [PATCH v3 00/19] sched: Migrate disable support
@ 2020-10-15 11:05 Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
                   ` (18 more replies)
  0 siblings, 19 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Hi,

Third version of migrate_disable()... hopefully getting to the point where we're all happy with it.

---
 fs/proc/array.c               |    4 
 include/linux/cpuhotplug.h    |    1 
 include/linux/cpumask.h       |    6 
 include/linux/preempt.h       |   64 +++
 include/linux/sched.h         |    5 
 include/linux/sched/hotplug.h |    2 
 include/linux/stop_machine.h  |    5 
 include/trace/events/sched.h  |   12 
 kernel/cpu.c                  |    9 
 kernel/sched/core.c           |  852 +++++++++++++++++++++++++++++++++---------
 kernel/sched/cpudeadline.c    |    4 
 kernel/sched/cpupri.c         |    4 
 kernel/sched/deadline.c       |   47 +-
 kernel/sched/rt.c             |   81 +++
 kernel/sched/sched.h          |   59 ++
 kernel/stop_machine.c         |   23 -
 kernel/workqueue.c            |    4 
 lib/cpumask.c                 |   18 
 lib/dump_stack.c              |    2 
 lib/smp_processor_id.c        |    5 
 20 files changed, 985 insertions(+), 222 deletions(-)




* [PATCH v3 01/19] stop_machine: Add function and caller debug info
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 02/19] sched: Fix balance_callback() Peter Zijlstra
                   ` (17 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Crashes in stop-machine are hard to connect to the calling code; add a
little something to help with that.
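
As an illustration (the caller and symbol names below are made up; only
the "Stopper: %pS <- %pS" format comes from the print_stop_info() added
here):

	/* hypothetical caller, for illustration only */
	stop_one_cpu(cpu, reset_widget, NULL);

	/*
	 * If reset_widget() then crashes, dump_stack() on that CPU will
	 * additionally print something along the lines of:
	 *
	 *   Stopper: reset_widget+0x0/0x40 <- widget_driver_init+0x123/0x456
	 *
	 * i.e. both the stop function and the _RET_IP_ of whoever queued it.
	 */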

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/stop_machine.h |    5 +++++
 kernel/stop_machine.c        |   23 ++++++++++++++++++++---
 lib/dump_stack.c             |    2 ++
 3 files changed, 27 insertions(+), 3 deletions(-)

--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
+	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else	/* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(u
 	return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif	/* CONFIG_SMP */
 
 /*
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,11 +42,23 @@ struct cpu_stopper {
 	struct list_head	works;		/* list of pending works */
 
 	struct cpu_stop_work	stop_work;	/* for stop_cpus */
+	unsigned long		caller;
+	cpu_stop_fn_t		fn;
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 static bool stop_machine_initialized = false;
 
+void print_stop_info(const char *log_lvl, struct task_struct *task)
+{
+	struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper);
+
+	if (task != stopper->thread)
+		return;
+
+	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
+}
+
 /* static data for stop_cpus */
 static DEFINE_MUTEX(stop_cpus_mutex);
 static bool stop_cpus_in_progress;
@@ -123,7 +135,7 @@ static bool cpu_stop_queue_work(unsigned
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 {
 	struct cpu_stop_done done;
-	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
 
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
@@ -331,7 +343,8 @@ int stop_two_cpus(unsigned int cpu1, uns
 	work1 = work2 = (struct cpu_stop_work){
 		.fn = multi_cpu_stop,
 		.arg = &msdata,
-		.done = &done
+		.done = &done,
+		.caller = _RET_IP_,
 	};
 
 	cpu_stop_init_done(&done, 2);
@@ -367,7 +380,7 @@ int stop_two_cpus(unsigned int cpu1, uns
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
-	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -487,6 +500,8 @@ static void cpu_stopper_thread(unsigned
 		int ret;
 
 		/* cpu stop callbacks must not sleep, make in_atomic() == T */
+		stopper->caller = work->caller;
+		stopper->fn = fn;
 		preempt_count_inc();
 		ret = fn(arg);
 		if (done) {
@@ -495,6 +510,8 @@ static void cpu_stopper_thread(unsigned
 			cpu_stop_signal_done(done);
 		}
 		preempt_count_dec();
+		stopper->fn = NULL;
+		stopper->caller = 0;
 		WARN_ONCE(preempt_count(),
 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
 		goto repeat;
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -12,6 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/kexec.h>
 #include <linux/utsname.h>
+#include <linux/stop_machine.h>
 
 static char dump_stack_arch_desc_str[128];
 
@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *l
 		       log_lvl, dump_stack_arch_desc_str);
 
 	print_worker_info(log_lvl, current);
+	print_stop_info(log_lvl, current);
 }
 
 /**




* [PATCH v3 02/19] sched: Fix balance_callback()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
                   ` (16 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme;
balance_callback() was called after rq->lock was dropped, which meant
another CPU could interleave and touch the callback list.

Rework the code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.
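
Roughly, the resulting pattern looks like this (a sketch modelled on the
__sched_setscheduler() hunk below; error paths elided):

	rq = task_rq_lock(p, &rf);
	/* ... class/priority change; may queue_balance_callback() ... */

	preempt_disable();			/* avoid rq going away on us */
	head = splice_balance_callbacks(rq);	/* detach the list under rq->lock */
	task_rq_unlock(rq, p, &rf);

	/* code that may itself take rq->lock, e.g. the PI chain adjustment */

	balance_callbacks(rq, head);		/* re-takes rq->lock only if @head */
	preempt_enable();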

Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  119 ++++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    3 +
 2 files changed, 78 insertions(+), 44 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3494,6 +3494,69 @@ static inline void finish_task(struct ta
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	void (*func)(struct rq *rq);
+	struct callback_head *next;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	struct callback_head *head = rq->balance_callback;
+
+	lockdep_assert_held(&rq->lock);
+	if (head)
+		rq->balance_callback = NULL;
+
+	return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	unsigned long flags;
+
+	if (unlikely(head)) {
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		do_balance_callbacks(rq, head);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3519,6 +3582,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3660,43 +3724,6 @@ static struct rq *finish_task_switch(str
 	return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-	struct callback_head *head, *next;
-	void (*func)(struct rq *rq);
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	head = rq->balance_callback;
-	rq->balance_callback = NULL;
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-	if (unlikely(rq->balance_callback))
-		__balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3716,7 +3743,6 @@ asmlinkage __visible void schedule_tail(
 	 */
 
 	rq = finish_task_switch(prev);
-	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -4532,10 +4558,11 @@ static void __sched notrace __schedule(b
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-		rq_unlock_irq(rq, &rf);
-	}
 
-	balance_callback(rq);
+		rq_unpin_lock(rq, &rf);
+		__balance_callbacks(rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
 }
 
 void __noreturn do_task_dead(void)
@@ -4946,9 +4973,11 @@ void rt_mutex_setprio(struct task_struct
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	__task_rq_unlock(rq, &rf);
 
-	balance_callback(rq);
+	rq_unpin_lock(rq, &rf);
+	__balance_callbacks(rq);
+	raw_spin_unlock(&rq->lock);
+
 	preempt_enable();
 }
 #else
@@ -5222,6 +5251,7 @@ static int __sched_setscheduler(struct t
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
 	const struct sched_class *prev_class;
+	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5460,6 +5490,7 @@ static int __sched_setscheduler(struct t
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi) {
@@ -5468,7 +5499,7 @@ static int __sched_setscheduler(struct t
 	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callback(rq);
+	balance_callbacks(rq, head);
 	preempt_enable();
 
 	return 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
 #endif
+#ifdef CONFIG_SMP
+	SCHED_WARN_ON(rq->balance_callback);
+#endif
 }
 
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)




* [PATCH v3 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 02/19] sched: Fix balance_callback() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
                   ` (15 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task, which is a per-cpu kthread, and completion of the hotplug
operation only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callback) test at the tail of
context_switch() with an unlikely(rq->balance_flags) test; the fast
path is not affected.
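
For reference, the two flags introduced below interact roughly like this
(a sketch; names as in this patch):

	/*
	 * BALANCE_WORK: set by queue_balance_callback(), cleared when the
	 *               list is spliced; makes the tail of the context
	 *               switch run the callback list.
	 * BALANCE_PUSH: set via balance_push_set() when the CPU goes
	 *               !active; suppresses queueing of new callbacks and
	 *               makes the context switch tail run balance_push()
	 *               instead.
	 */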

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  114 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    7 ++-
 2 files changed, 118 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3509,8 +3509,10 @@ static inline struct callback_head *spli
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3531,6 +3533,21 @@ static inline void balance_callbacks(str
 	}
 }
 
+static void balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+	if (likely(!rq->balance_flags))
+		return;
+
+	if (rq->balance_flags & BALANCE_PUSH) {
+		balance_push(rq);
+		return;
+	}
+
+	__balance_callbacks(rq);
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -3546,6 +3563,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
@@ -3573,7 +3594,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6831,6 +6852,90 @@ static void migrate_tasks(struct rq *dea
 
 	rq->stop = stop;
 }
+
+static int __balance_push_cpu_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	update_rq_clock(rq);
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static void balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this case and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline void balance_push(struct rq *rq)
+{
+}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6916,6 +7021,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6963,6 +7070,8 @@ int sched_cpu_deactivate(unsigned int cp
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6976,6 +7085,7 @@ int sched_cpu_deactivate(unsigned int cp
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1385,6 +1386,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1392,12 +1396,13 @@ queue_balance_callback(struct rq *rq,
 {
 	lockdep_assert_held(&rq->lock);
 
-	if (unlikely(head->next))
+	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
 		return;
 
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \




* [PATCH v3 04/19] sched/core: Wait for tasks being pushed away on hotplug
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 05/19] workqueue: Manually break affinity " Peter Zijlstra
                   ` (14 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Thomas Gleixner <tglx@linutronix.de>

RT kernels need to ensure that all tasks which are not per CPU kthreads
have left the outgoing CPU to guarantee that no tasks are force migrated
within a migrate disabled section.

There is also some desire to (ab)use fine grained CPU hotplug control to
clear a CPU from active state to force migrate tasks which are not per CPU
kthreads away for power control purposes.

Add a mechanism which waits until all tasks which should leave the CPU
after the CPU active flag is cleared have moved to a different online CPU.
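
Schematically, the handshake looks like this (both halves are in the
diff below):

	/*
	 *   hotplug control thread              outgoing CPU, switch tail
	 *   ----------------------              -------------------------
	 *   balance_hotplug_wait():             balance_push():
	 *     rcuwait_wait_event(                 if (!rq->nr_running &&
	 *         &rq->hotplug_wait,                  rcuwait_active(&rq->hotplug_wait))
	 *         rq->nr_running == 1, ...);          rcuwait_wake_up(&rq->hotplug_wait);
	 */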

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   40 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    4 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6894,8 +6894,21 @@ static void balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task))
+	if (is_per_cpu_kthread(push_task)) {
+		/*
+		 * If this is the idle task on the outgoing CPU try to wake
+		 * up the hotplug control thread which might wait for the
+		 * last task to vanish. The rcuwait_active() check is
+		 * accurate here because the waiter is pinned on this CPU
+		 * and can't obviously be running in parallel.
+		 */
+		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+			raw_spin_unlock(&rq->lock);
+			rcuwait_wake_up(&rq->hotplug_wait);
+			raw_spin_lock(&rq->lock);
+		}
 		return;
+	}
 
 	get_task_struct(push_task);
 	/*
@@ -6926,6 +6939,20 @@ static void balance_push_set(int cpu, bo
 	rq_unlock_irqrestore(rq, &rf);
 }
 
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+	struct rq *rq = this_rq();
+
+	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+			   TASK_UNINTERRUPTIBLE);
+}
+
 #else
 
 static inline void balance_push(struct rq *rq)
@@ -6936,6 +6963,10 @@ static inline void balance_push_set(int
 {
 }
 
+static inline void balance_hotplug_wait(void)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -7090,6 +7121,10 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+
+	/* Wait for all non per CPU kernel threads to vanish. */
+	balance_hotplug_wait();
+
 	return 0;
 }
 
@@ -7330,6 +7365,9 @@ void __init sched_init(void)
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+		rcuwait_init(&rq->hotplug_wait);
+#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1004,6 +1004,10 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct rcuwait		hotplug_wait;
+#endif
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING




* [PATCH v3 05/19] workqueue: Manually break affinity on hotplug
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
                   ` (13 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Don't rely on the scheduler to force break affinity for us -- it will
stop doing that for per-cpu-kthreads.
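
In other words, subsystems with kthreads that may still be affine to the
outgoing CPU have to rebind them themselves; a minimal sketch of the
pattern (workqueue's version is the hunk below):

	/* in the subsystem's CPU-down path, for each affected kthread: */
	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpu_active_mask) < 0);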

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c |    4 ++++
 1 file changed, 4 insertions(+)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4905,6 +4905,10 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		raw_spin_unlock_irq(&pool->lock);
+
+		for_each_pool_worker(worker, pool)
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+
 		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*




* [PATCH v3 06/19] sched/hotplug: Consolidate task migration on CPU unplug
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 05/19] workqueue: Manually break affinity " Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
                   ` (12 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Thomas Gleixner <tglx@linutronix.de>

With the new mechanism which kicks tasks off the outgoing CPU at the end of
schedule(), the situation on an outgoing CPU right before the stopper thread
brings it down completely is:

 - All user tasks and all unbound kernel threads have either been migrated
   away or are not running, and the next wakeup will move them to an
   online CPU.

 - All per CPU kernel threads, except the cpu hotplug thread and the stopper
   thread, have either been unbound or parked by the responsible CPU hotplug
   callback.

That means that at the last step before the stopper thread is invoked, the
cpu hotplug thread is the last legitimate running task on the outgoing
CPU.

Add a final wait step right before the stopper thread is kicked which
ensures that any still running tasks on the way to park or on the way to
kick themselves off the CPU are either sleeping or gone.

This allows removing the migrate_tasks() crutch in sched_cpu_dying(). If
sched_cpu_dying() detects that there is still another running task aside
from the stopper thread then it will explode with the appropriate fireworks.
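
For orientation, the AP states are torn down in descending order, so the
new step ends up running right before the stopper is brought into play
(rough sketch; details in the cpuhotplug.h and cpu.c hunks below):

	/*
	 *   CPUHP_AP_SMPBOOT_THREADS    - park smpboot threads
	 *   CPUHP_AP_SCHED_WAIT_EMPTY   - new: sched_cpu_wait_empty()
	 *   CPUHP_AP_ONLINE_IDLE
	 *   CPUHP_TEARDOWN_CPU          - takedown_cpu(), stopper takes over
	 */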

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpuhotplug.h    |    1 
 include/linux/sched/hotplug.h |    2 
 kernel/cpu.c                  |    9 ++
 kernel/sched/core.c           |  154 +++++++++---------------------------------
 4 files changed, 46 insertions(+), 120 deletions(-)

--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -151,6 +151,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned i
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif
 
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6759,120 +6759,6 @@ void idle_task_exit(void)
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-	const struct sched_class *class;
-	struct task_struct *next;
-
-	for_each_class(class) {
-		next = class->pick_next_task(rq);
-		if (next) {
-			next->sched_class->put_prev_task(rq, next);
-			return next;
-		}
-	}
-
-	/* The idle class should always have a runnable task */
-	BUG();
-}
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
-{
-	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
-	struct rq_flags orf = *rf;
-	int dest_cpu;
-
-	/*
-	 * Fudge the rq selection such that the below task selection loop
-	 * doesn't get stuck on the currently eligible stop task.
-	 *
-	 * We're currently inside stop_machine() and the rq is either stuck
-	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
-	 * either way we should never end up calling schedule() until we're
-	 * done here.
-	 */
-	rq->stop = NULL;
-
-	/*
-	 * put_prev_task() and pick_next_task() sched
-	 * class method both need to have an up-to-date
-	 * value of rq->clock[_task]
-	 */
-	update_rq_clock(rq);
-
-	for (;;) {
-		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread:
-		 */
-		if (rq->nr_running == 1)
-			break;
-
-		next = __pick_migrate_task(rq);
-
-		/*
-		 * Rules for changing task_struct::cpus_mask are holding
-		 * both pi_lock and rq->lock, such that holding either
-		 * stabilizes the mask.
-		 *
-		 * Drop rq->lock is not quite as disastrous as it usually is
-		 * because !cpu_active at this point, which means load-balance
-		 * will not interfere. Also, stop-machine.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&next->pi_lock);
-		rq_relock(rq, rf);
-
-		/*
-		 * Since we're inside stop-machine, _nothing_ should have
-		 * changed the task, WARN if weird stuff happened, because in
-		 * that case the above rq->lock drop is a fail too.
-		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-			raw_spin_unlock(&next->pi_lock);
-			continue;
-		}
-
-		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		rq = __migrate_task(rq, rf, next, dest_cpu);
-		if (rq != dead_rq) {
-			rq_unlock(rq, rf);
-			rq = dead_rq;
-			*rf = orf;
-			rq_relock(rq, rf);
-		}
-		raw_spin_unlock(&next->pi_lock);
-	}
-
-	rq->stop = stop;
-}
-
 static int __balance_push_cpu_stop(void *arg)
 {
 	struct task_struct *p = arg;
@@ -7128,10 +7014,6 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
-
-	/* Wait for all non per CPU kernel threads to vanish. */
-	balance_hotplug_wait();
-
 	return 0;
 }
 
@@ -7151,6 +7033,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do its job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+	balance_hotplug_wait();
+	return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+	long delta = calc_load_fold_active(rq, 1);
+
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7164,7 +7081,6 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 




* [PATCH v3 07/19] sched: Fix hotplug vs CPU bandwidth control
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (5 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
                   ` (11 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Since we now migrate tasks away before DYING, we should also move
bandwidth unthrottle; otherwise we can gain tasks from unthrottle
after we expect all tasks to be gone already.

Also, it looks like the RT balancers don't respect cpu_active() and
instead partly rely on rq->online; complete this. This too requires
we do set_rq_offline() earlier to match the cpu_active() semantics.
(The bigger patch is to convert RT to cpu_active() entirely.)

Since set_rq_online() is called from sched_cpu_activate(), place
set_rq_offline() in sched_cpu_deactivate().
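
The net effect on the hotplug path (a sketch, as per the core.c hunk
below):

	sched_cpu_deactivate()
		set_cpu_active(cpu, false);
		...
		balance_push_set(cpu, true);
		set_rq_offline(rq);	/* moved here from sched_cpu_dying() */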

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   14 ++++++++++----
 kernel/sched/deadline.c |    5 +----
 kernel/sched/rt.c       |    5 +----
 3 files changed, 12 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6979,6 +6979,8 @@ int sched_cpu_activate(unsigned int cpu)
 
 int sched_cpu_deactivate(unsigned int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
 	int ret;
 
 	set_cpu_active(cpu, false);
@@ -6993,6 +6995,14 @@ int sched_cpu_deactivate(unsigned int cp
 
 	balance_push_set(cpu, true);
 
+	rq_lock_irqsave(rq, &rf);
+	if (rq->rd) {
+		update_rq_clock(rq);
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	rq_unlock_irqrestore(rq, &rf);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -7074,10 +7084,6 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	if (rq->rd) {
-		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-		set_rq_offline(rq);
-	}
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq);
 
 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 {
-	return dl_task(prev);
+	return rq->online && dl_task(prev);
 }
 
 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
 static inline int rt_overloaded(struct rq *rq)




* [PATCH v3 08/19] sched: Massage set_cpus_allowed()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (6 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 09/19] sched: Add migrate_disable() Peter Zijlstra
                   ` (10 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Thread a u32 flags word through the *set_cpus_allowed*() callchain.
This will allow adding behavioural tweaks for future users.
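
For example (a sketch; SCA_CHECK is the only flag defined so far, more
follow in later patches):

	/* kernel-internal callers: no PF_NO_SETAFFINITY check */
	__set_cpus_allowed_ptr(p, new_mask, 0);

	/* sched_setaffinity() path: honour PF_NO_SETAFFINITY */
	__set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);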

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   28 ++++++++++++++++++----------
 kernel/sched/deadline.c |    5 +++--
 kernel/sched/sched.h    |    7 +++++--
 3 files changed, 26 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1828,13 +1828,14 @@ static int migration_cpu_stop(void *data
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
@@ -1855,7 +1856,7 @@ void do_set_cpus_allowed(struct task_str
 	if (running)
 		put_prev_task(rq, p);
 
-	p->sched_class->set_cpus_allowed(p, new_mask);
+	p->sched_class->set_cpus_allowed(p, new_mask, flags);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1863,6 +1864,11 @@ void do_set_cpus_allowed(struct task_str
 		set_next_task(rq, p);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1873,7 +1879,8 @@ void do_set_cpus_allowed(struct task_str
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask, bool check)
+				  const struct cpumask *new_mask,
+				  u32 flags)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	unsigned int dest_cpu;
@@ -1895,7 +1902,7 @@ static int __set_cpus_allowed_ptr(struct
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
 	 */
-	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1914,7 +1921,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	do_set_cpus_allowed(p, new_mask);
+	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -1951,7 +1958,7 @@ static int __set_cpus_allowed_ptr(struct
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, false);
+	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -2410,7 +2417,8 @@ void sched_set_stop_task(int cpu, struct
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 const struct cpumask *new_mask, bool check)
+					 const struct cpumask *new_mask,
+					 u32 flags)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
@@ -6025,7 +6033,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -6608,7 +6616,7 @@ void init_idle(struct task_struct *idle,
 	 *
 	 * And since this is boot we can forgo the serialization.
 	 */
-	set_cpus_allowed_common(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
 #endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2280,7 +2280,8 @@ static void task_woken_dl(struct rq *rq,
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
-				const struct cpumask *new_mask)
+				const struct cpumask *new_mask,
+				u32 flags)
 {
 	struct root_domain *src_rd;
 	struct rq *rq;
@@ -2309,7 +2310,7 @@ static void set_cpus_allowed_dl(struct t
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	set_cpus_allowed_common(p, new_mask);
+	set_cpus_allowed_common(p, new_mask, flags);
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1806,7 +1806,8 @@ struct sched_class {
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-				 const struct cpumask *newmask);
+				 const struct cpumask *newmask,
+				 u32 flags);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
@@ -1899,7 +1900,9 @@ extern void update_group_capacity(struct
 
 extern void trigger_load_balance(struct rq *rq);
 
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#define SCA_CHECK		0x01
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
 




* [PATCH v3 09/19] sched: Add migrate_disable()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (7 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
                   ` (9 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Add the base migrate_disable() support (under protest).

While migrate_disable() is (currently) required for PREEMPT_RT, it is
also one of the biggest flaws in the system.

Notably this is just the base implementation; it is broken vs
sched_setaffinity() and hotplug, both solved in additional patches for
ease of review.
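
For context, the intended use is protecting per-CPU state without
disabling preemption; a minimal sketch (the per-CPU variable is made up
for illustration):

	static DEFINE_PER_CPU(int, my_counter);		/* hypothetical */

	migrate_disable();
	/*
	 * Preemption stays enabled, but this task can no longer be
	 * migrated, so the CPU backing this_cpu_*() is stable here.
	 */
	this_cpu_inc(my_counter);
	migrate_enable();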

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   60 +++++++++++++++++++++++++
 include/linux/sched.h   |    3 +
 kernel/sched/core.c     |  112 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |    6 +-
 lib/smp_processor_id.c  |    5 ++
 5 files changed, 178 insertions(+), 8 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,6 +322,64 @@ static inline void preempt_notifier_init
 
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+/*
+ * Migrate-Disable and why it is (strongly) undesired.
+ *
+ * The premise of the Real-Time schedulers we have on Linux
+ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+ * concurrently, provided there are sufficient runnable tasks, also known as
+ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+ *
+ * The correctness of various scheduling models depends on this, but it is
+ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+ * preempt_disable() implies an immediate priority ceiling, preemptible
+ * migrate_disable() allows nesting.
+ *
+ * The worst case is that all tasks preempt one another in a migrate_disable()
+ * region and stack on a single CPU. This then reduces the available bandwidth
+ * to a single CPU. And since Real-Time schedulability theory considers the
+ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+ * (instantly solving the SMP analysis problem).
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ *
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else /* !(CONFIG_SMP && CONFIG_PREEMPT_RT) */
+
 /**
  * migrate_disable - Prevent migration of the current task
  *
@@ -352,4 +410,6 @@ static __always_inline void migrate_enab
 	preempt_enable();
 }
 
+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
+
 #endif /* __LINUX_PREEMPT_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,9 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migration_disabled;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1700,6 +1700,61 @@ void check_preempt_curr(struct rq *rq, s
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask,
+				  u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+	if (likely(!p->migration_disabled))
+		return;
+
+	if (p->cpus_ptr != &p->cpus_mask)
+		return;
+
+	/*
+	 * Violates locking rules! see comment in __do_set_cpus_allowed().
+	 */
+	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+	if (current->migration_disabled++)
+		return;
+
+	barrier();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+	struct task_struct *p = current;
+
+	if (--p->migration_disabled)
+		return;
+
+	barrier();
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		return;
+
+	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return p->migration_disabled;
+}
+
+#endif
+
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1709,7 +1764,7 @@ static inline bool is_cpu_allowed(struct
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1830,6 +1885,11 @@ static int migration_cpu_stop(void *data
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
+	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+		p->cpus_ptr = new_mask;
+		return;
+	}
+
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
@@ -1840,7 +1900,22 @@ __do_set_cpus_allowed(struct task_struct
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * This here violates the locking rules for affinity, since we're only
+	 * supposed to change these variables while holding both rq->lock and
+	 * p->pi_lock.
+	 *
+	 * HOWEVER, it magically works, because ttwu() is the only code that
+	 * accesses these variables under p->pi_lock and only does so after
+	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+	 * before finish_task().
+	 *
+	 * XXX do further audits, this smells like something putrid.
+	 */
+	if (flags & SCA_MIGRATE_DISABLE)
+		SCHED_WARN_ON(!p->on_cpu);
+	else
+		lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -1891,9 +1966,14 @@ static int __set_cpus_allowed_ptr(struct
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD) {
+	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs
+		 * Kernel threads are allowed on online && !active CPUs.
+		 *
+		 * Specifically, migration_disabled() tasks must not fail the
+		 * cpumask_any_and_distribute() pick below, esp. so on
+		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
+		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	}
@@ -1907,7 +1987,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_mask, new_mask))
+	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	/*
@@ -1999,6 +2079,8 @@ void set_task_cpu(struct task_struct *p,
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
+
+	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2329,6 +2411,12 @@ static int select_fallback_rq(int cpu, s
 			}
 			fallthrough;
 		case possible:
+			/*
+			 * XXX When called from select_task_rq() we only
+			 * hold p->pi_lock and again violate locking order.
+			 *
+			 * More yuck to audit.
+			 */
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
@@ -2363,7 +2451,7 @@ int select_task_rq(struct task_struct *p
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1)
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);
@@ -2425,6 +2513,17 @@ static inline int __set_cpus_allowed_ptr
 
 #endif /* CONFIG_SMP */
 
+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return false;
+}
+
+#endif
+
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
@@ -4589,6 +4688,7 @@ static void __sched notrace __schedule(b
 		 */
 		++*switch_count;
 
+		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1895,14 +1895,16 @@ static inline bool sched_fair_runnable(s
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_next_task_idle(struct rq *rq);
 
+#define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
+
 #ifdef CONFIG_SMP
 
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
 
-#define SCA_CHECK		0x01
-
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(c
 	if (current->nr_cpus_allowed == 1)
 		goto out;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	if (current->migration_disabled)
+		goto out;
+#endif
+
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */




* [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (8 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 09/19] sched: Add migrate_disable() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 13:54   ` Valentin Schneider
  2020-10-16 12:48   ` Valentin Schneider
  2020-10-15 11:05 ` [PATCH v3 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
                   ` (8 subsequent siblings)
  18 siblings, 2 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Concurrent migrate_disable() and set_cpus_allowed_ptr() have
interesting features. We rely on set_cpus_allowed_ptr() to not return
until the task runs inside the provided mask. This expectation is
exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

	preempt_disable();
	migrate_disable();
	...
	migrate_enable();
	preempt_enable();

And:

	raw_spin_lock(&B);
	spin_unlock(&A);

This means that when migrate_enable() must restore the affinity
mask, it cannot wait for completion thereof. Luck will have it that
that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.
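
Schematically (a rough sketch of the interplay described above; the
details are in affine_move_task() and migration_cpu_stop() below):

	/*
	 *   set_cpus_allowed_ptr() caller       task in migrate_disable()
	 *   -----------------------------       -------------------------
	 *   installs p->migration_pending,      ...
	 *   then waits for it to complete       migrate_enable():
	 *                                         __set_cpus_allowed_ptr(...,
	 *                                             SCA_MIGRATE_ENABLE)
	 *                                         reuses pending->arg for an
	 *                                         async stop_one_cpu_nowait()
	 *                                       migration_cpu_stop():
	 *                                         moves the task, then
	 *                                         complete_all(arg->done)
	 */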

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |  161 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 139 insertions(+), 23 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 	int				migration_disabled;
 #endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1731,15 +1731,26 @@ void migrate_enable(void)
 {
 	struct task_struct *p = current;
 
-	if (--p->migration_disabled)
+	if (p->migration_disabled > 1) {
+		p->migration_disabled--;
 		return;
+	}
 
+	/*
+	 * Ensure stop_task runs either before or after this, and that
+	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+	 */
+	preempt_disable();
+	if (p->cpus_ptr != &p->cpus_mask)
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
+	 * regular cpus_mask, otherwise things that race (eg.
+	 * select_fallback_rq) get confused.
+	 */
 	barrier();
-
-	if (p->cpus_ptr == &p->cpus_mask)
-		return;
-
-	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	p->migration_disabled = 0;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -1806,6 +1817,7 @@ static struct rq *move_queued_task(struc
 struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
+	struct completion *done;
 };
 
 /*
@@ -1840,6 +1852,7 @@ static int migration_cpu_stop(void *data
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
+	bool complete = false;
 	struct rq_flags rf;
 
 	/*
@@ -1862,15 +1875,27 @@ static int migration_cpu_stop(void *data
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
+		if (is_migration_disabled(p))
+			goto out;
+
 		if (task_on_rq_queued(p))
 			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
 		else
 			p->wake_cpu = arg->dest_cpu;
+
+		if (arg->done) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
 	}
+out:
 	rq_unlock(rq, &rf);
 	raw_spin_unlock(&p->pi_lock);
-
 	local_irq_enable();
+
+	if (complete)
+		complete_all(arg->done);
+
 	return 0;
 }
 
@@ -1939,6 +1964,111 @@ void do_set_cpus_allowed(struct task_str
 	__do_set_cpus_allowed(p, new_mask, 0);
 }
 
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
+};
+
+/*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct rq_flags *rf,
+			    struct task_struct *p, int dest_cpu, unsigned int flags)
+{
+	struct set_affinity_pending my_pending = { }, *pending = NULL;
+	struct migration_arg arg = {
+		.task = p,
+		.dest_cpu = dest_cpu,
+	};
+	bool complete = false;
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		pending = p->migration_pending;
+		if (pending) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+		if (complete)
+			goto do_complete;
+
+		return 0;
+	}
+
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		/* serialized by p->pi_lock */
+		if (!p->migration_pending) {
+			refcount_set(&my_pending.refs, 1);
+			init_completion(&my_pending.done);
+			p->migration_pending = &my_pending;
+		} else {
+			pending = p->migration_pending;
+			refcount_inc(&pending->refs);
+		}
+	}
+	pending = p->migration_pending;
+	/*
+	 * - !MIGRATE_ENABLE:
+	 *   we'll have installed a pending if there wasn't one already.
+	 *
+	 * - MIGRATE_ENABLE:
+	 *   we're here because the current CPU isn't matching anymore,
+	 *   the only way that can happen is because of a concurrent
+	 *   set_cpus_allowed_ptr() call, which should then still be
+	 *   pending completion.
+	 *
+	 * Either way, we really should have a @pending here.
+	 */
+	if (WARN_ON_ONCE(!pending))
+		return -EINVAL;
+
+	arg.done = &pending->done;
+
+	if (flags & SCA_MIGRATE_ENABLE) {
+
+		task_rq_unlock(rq, p, rf);
+		pending->arg = arg;
+		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+
+		return 0;
+	}
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+	} else {
+
+		if (!is_migration_disabled(p)) {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, rf, p, dest_cpu);
+
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+do_complete:
+		if (complete)
+			complete_all(&pending->done);
+	}
+
+	wait_for_completion(&pending->done);
+
+	if (refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
+
+	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+	return 0;
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -2008,23 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
 			p->nr_cpus_allowed != 1);
 	}
 
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
+	return affine_move_task(rq, &rf, p, dest_cpu, flags);
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		return 0;
-	} else if (task_on_rq_queued(p)) {
-		/*
-		 * OK, since we're going to drop the lock immediately
-		 * afterwards anyway.
-		 */
-		rq = move_queued_task(rq, &rf, p, dest_cpu);
-	}
 out:
 	task_rq_unlock(rq, p, &rf);
 



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (9 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
       [not found]   ` <BN8PR12MB2978A3BB8062DF4CEEB184109A030@BN8PR12MB2978.namprd12.prod.outlook.com>
  2020-10-15 11:05 ` [PATCH v3 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
                   ` (7 subsequent siblings)
  18 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Thomas Gleixner <tglx@linutronix.de>

On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they return to a migratable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   36 ++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |    4 ++++
 2 files changed, 34 insertions(+), 6 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1725,10 +1725,17 @@ static void migrate_disable_switch(struc
 
 void migrate_disable(void)
 {
-	if (current->migration_disabled++)
+	struct task_struct *p = current;
+
+	if (p->migration_disabled) {
+		p->migration_disabled++;
 		return;
+	}
 
-	barrier();
+	preempt_disable();
+	this_rq()->nr_pinned++;
+	p->migration_disabled = 1;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -1755,6 +1762,7 @@ void migrate_enable(void)
 	 */
 	barrier();
 	p->migration_disabled = 0;
+	this_rq()->nr_pinned--;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1764,6 +1772,11 @@ static inline bool is_migration_disabled
 	return p->migration_disabled;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return rq->nr_pinned;
+}
+
 #endif
 
 /*
@@ -2634,6 +2647,11 @@ static inline bool is_migration_disabled
 	return false;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return false;
+}
+
 #endif
 
 static void
@@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task)) {
+	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the
 		 * last task to vanish. The rcuwait_active() check is
 		 * accurate here because the waiter is pinned on this CPU
 		 * and can't obviously be running in parallel.
+		 *
+		 * On RT kernels this also has to check whether there are
+		 * pinned and scheduled out tasks on the runqueue. They
+		 * need to leave the migrate disabled section first.
 		 */
-		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+		    rcuwait_active(&rq->hotplug_wait)) {
 			raw_spin_unlock(&rq->lock);
 			rcuwait_wake_up(&rq->hotplug_wait);
 			raw_spin_lock(&rq->lock);
@@ -7063,7 +7086,8 @@ static void balance_hotplug_wait(void)
 {
 	struct rq *rq = this_rq();
 
-	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+	rcuwait_wait_event(&rq->hotplug_wait,
+			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
 			   TASK_UNINTERRUPTIBLE);
 }
 
@@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1);
+	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	unsigned int		nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 12/19] sched,rt: Use cpumask_any*_distribute()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (10 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
                   ` (6 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Replace a bunch of cpumask_any*() instances with
cpumask_any*_distribute(); by injecting this little bit of randomness
into CPU selection, we reduce the chance that two competing balance
operations working off the same lowest_mask pick the same CPU.
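
For intuition, here is a standalone sketch of the rotation this
introduces (userspace, illustrative only): repeated calls walk the set
bits round-robin instead of always returning the first one, mirroring
the per-CPU distribute_cpu_mask_prev state used by the kernel helper.

	#include <stdio.h>

	#define NBITS 8

	static int prev;	/* per-CPU (distribute_cpu_mask_prev) in the kernel */

	static int next_bit(unsigned int mask, int from)
	{
		for (int i = from + 1; i < NBITS; i++)
			if (mask & (1u << i))
				return i;
		return NBITS;
	}

	static int any_distribute(unsigned int mask)
	{
		int next = next_bit(mask, prev);

		if (next >= NBITS)
			next = next_bit(mask, -1);
		if (next < NBITS)
			prev = next;

		return next;
	}

	int main(void)
	{
		unsigned int mask = 0x16;	/* "CPUs" 1, 2 and 4 */

		for (int i = 0; i < 6; i++)
			printf("%d ", any_distribute(mask));	/* 1 2 4 1 2 4 */
		printf("\n");

		return 0;
	}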

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpumask.h |    6 ++++++
 kernel/sched/deadline.c |    6 +++---
 kernel/sched/rt.c       |    6 +++---
 lib/cpumask.c           |   18 ++++++++++++++++++
 4 files changed, 30 insertions(+), 6 deletions(-)

--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distri
 	return cpumask_next_and(-1, src1p, src2p);
 }
 
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1981,8 +1981,8 @@ static int find_later_rq(struct task_str
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(later_mask,
-							sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(later_mask,
+							      sched_domain_span(sd));
 			/*
 			 * Last chance: if a CPU being in both later_mask
 			 * and current sd span is valid, that becomes our
@@ -2004,7 +2004,7 @@ static int find_later_rq(struct task_str
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(later_mask);
+	cpu = cpumask_any_distribute(later_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_st
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(lowest_mask,
-						     sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(lowest_mask,
+							      sched_domain_span(sd));
 			if (best_cpu < nr_cpu_ids) {
 				rcu_read_unlock();
 				return best_cpu;
@@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_st
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(lowest_mask);
+	cpu = cpumask_any_distribute(lowest_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const str
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
+
+int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	int next, prev;
+
+	/* NOTE: our first selection will skip 0. */
+	prev = __this_cpu_read(distribute_cpu_mask_prev);
+
+	next = cpumask_next(prev, srcp);
+	if (next >= nr_cpu_ids)
+		next = cpumask_first(srcp);
+
+	if (next < nr_cpu_ids)
+		__this_cpu_write(distribute_cpu_mask_prev, next);
+
+	return next;
+}
+EXPORT_SYMBOL(cpumask_any_distribute);



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 13/19] sched,rt: Use the full cpumask for balancing
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (11 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
                   ` (5 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

We want migrate_disable() tasks to get PULLs in order for them to PUSH
away the higher priority task.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/cpudeadline.c |    4 ++--
 kernel/sched/cpupri.c      |    4 ++--
 kernel/sched/deadline.c    |    4 ++--
 kernel/sched/rt.c          |    4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
 		unsigned long cap, max_cap = 0;
 		int cpu, max_cpu = -1;
 
@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct
 
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-		if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+		if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 			if (later_mask)
 				cpumask_set_cpu(best_cpu, later_mask);
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct c
 	if (skip)
 		return 0;
 
-	if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
 		return 0;
 
 	if (lowest_mask) {
-		cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
 
 		/*
 		 * We have to ensure that we have at least one bit
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1888,7 +1888,7 @@ static void task_fork_dl(struct task_str
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 	return 0;
 }
@@ -2038,7 +2038,7 @@ static struct rq *find_lock_later_rq(str
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1658,7 +1658,7 @@ static void put_prev_task_rt(struct rq *
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 
 	return 0;
@@ -1811,7 +1811,7 @@ static struct rq *find_lock_lowest_rq(st
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 14/19] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (12 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
                   ` (4 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

There's a valid ->pi_lock recursion issue where the actual PI code
tries to wake up the stop task. Make lockdep aware so it doesn't
complain about this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   15 +++++++++++++++
 1 file changed, 15 insertions(+)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2602,6 +2602,7 @@ int select_task_rq(struct task_struct *p
 
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
+	static struct lock_class_key stop_pi_lock;
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 
@@ -2617,6 +2618,20 @@ void sched_set_stop_task(int cpu, struct
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 
 		stop->sched_class = &stop_sched_class;
+
+		/*
+		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+		 * adjust the effective priority of a task. As a result,
+		 * rt_mutex_setprio() can trigger (RT) balancing operations,
+		 * which can then trigger wakeups of the stop thread to push
+		 * around the current task.
+		 *
+		 * The stop task itself will never be part of the PI-chain, it
+		 * never blocks, therefore that ->pi_lock recursion is safe.
+		 * Tell lockdep about this by placing the stop->pi_lock in its
+		 * own class.
+		 */
+		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
 	}
 
 	cpu_rq(cpu)->stop = stop;



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (13 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
                   ` (3 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

In order to minimize the interference of migrate_disable() on lower
priority tasks, which can be deprived of runtime due to being stuck
below a higher priority task, teach the RT/DL balancers to push away
these higher priority tasks when a lower priority task gets selected
to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but
restores bandwidth to the system that would otherwise be irrevocably lost.
Without this it would be possible to have all tasks on the system
stuck on a single CPU, each task preempted in a migrate_disable()
section with a single high priority task running.

This way we can still approximate running the M highest priority tasks
on the system.

Migrating the top task away is (of course) still subject to
migrate_disable() too, which means the lower task is subject to an
interference equivalent to the worst case migrate_disable() section.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   38 +++++++++++++++------------
 include/linux/sched.h   |    3 +-
 kernel/sched/core.c     |   67 ++++++++++++++++++++++++++++++++++++++++--------
 kernel/sched/deadline.c |   29 +++++++++++++++-----
 kernel/sched/rt.c       |   63 ++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h    |   32 ++++++++++++++++++++++
 6 files changed, 185 insertions(+), 47 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -296,24 +296,28 @@ static inline void preempt_notifier_init
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 
 /*
- * Migrate-Disable and why it is (strongly) undesired.
+ * Migrate-Disable and why it is undesired.
  *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority task, which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing its available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
  *
  *
  * The reason we have it anyway.
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -715,8 +715,9 @@ struct task_struct {
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
-	int				migration_disabled;
+	unsigned short			migration_disabled;
 #endif
+	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1763,11 +1763,6 @@ void migrate_enable(void)
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return p->migration_disabled;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return rq->nr_pinned;
@@ -1913,6 +1908,49 @@ static int migration_cpu_stop(void *data
 	return 0;
 }
 
+int push_cpu_stop(void *arg)
+{
+	struct rq *lowest_rq = NULL, *rq = this_rq();
+	struct task_struct *p = arg;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+
+	if (task_rq(p) != rq)
+		goto out_unlock;
+
+	if (is_migration_disabled(p)) {
+		p->migration_flags |= MDF_PUSH;
+		goto out_unlock;
+	}
+
+	p->migration_flags &= ~MDF_PUSH;
+
+	if (p->sched_class->find_lock_rq)
+		lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+	if (!lowest_rq)
+		goto out_unlock;
+
+	// XXX validate p is still the highest prio task
+	if (task_rq(p) == rq) {
+		deactivate_task(rq, p, 0);
+		set_task_cpu(p, lowest_rq->cpu);
+		activate_task(lowest_rq, p, 0);
+		resched_curr(lowest_rq);
+	}
+
+	double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+	rq->push_busy = false;
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+	return 0;
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
@@ -2000,6 +2038,14 @@ static int affine_move_task(struct rq *r
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		struct task_struct *push_task = NULL;
+
+		if ((flags & SCA_MIGRATE_ENABLE) &&
+		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+			rq->push_busy = true;
+			push_task = get_task_struct(p);
+		}
+
 		pending = p->migration_pending;
 		if (pending) {
 			p->migration_pending = NULL;
@@ -2007,6 +2053,11 @@ static int affine_move_task(struct rq *r
 		}
 		task_rq_unlock(rq, p, rf);
 
+		if (push_task) {
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    p, &rq->push_work);
+		}
+
 		if (complete)
 			goto do_complete;
 
@@ -2044,6 +2095,7 @@ static int affine_move_task(struct rq *r
 
 	if (flags & SCA_MIGRATE_ENABLE) {
 
+		p->migration_flags &= ~MDF_PUSH;
 		task_rq_unlock(rq, p, rf);
 		pending->arg = arg;
 		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
@@ -2656,11 +2708,6 @@ static inline int __set_cpus_allowed_ptr
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return false;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return false;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task))
+		return 0;
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq)
 static void pull_dl_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
@@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this
 			continue;
 
 		/* Might drop this_rq->lock */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
-			dmin = p->dl.deadline;
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				dmin = p->dl.deadline;
+				resched = true;
+			}
 
 			/* Is there any other task even earlier? */
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
 	.task_woken		= task_woken_dl,
+	.find_lock_rq		= find_lock_later_rq,
 #endif
 
 	.task_tick		= task_tick_dl,
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task)) {
+		struct task_struct *push_task = NULL;
+		int cpu;
+
+		if (!pull || rq->push_busy)
+			return 0;
+
+		cpu = find_lowest_rq(rq->curr);
+		if (cpu == -1 || cpu == rq->cpu)
+			return 0;
+
+		/*
+		 * Given we found a CPU with lower priority than @next_task,
+		 * therefore it should be running. However we cannot migrate it
+		 * to this other CPU, instead attempt to push the current
+		 * running task on this CPU away.
+		 */
+		push_task = get_push_task(rq);
+		if (push_task) {
+			raw_spin_unlock(&rq->lock);
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    push_task, &rq->push_work);
+			raw_spin_lock(&rq->lock);
+		}
+
+		return 0;
+	}
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -1927,12 +1955,10 @@ static int push_rt_task(struct rq *rq)
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
-	ret = 1;
-
 	resched_curr(lowest_rq);
+	ret = 1;
 
 	double_unlock_balance(rq, lowest_rq);
-
 out:
 	put_task_struct(next_task);
 
@@ -1942,7 +1968,7 @@ static int push_rt_task(struct rq *rq)
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
-	while (push_rt_task(rq))
+	while (push_rt_task(rq, false))
 		;
 }
 
@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_w
 	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_tasks(rq);
+		while (push_rt_task(rq, true))
+			;
 		raw_spin_unlock(&rq->lock);
 	}
 
@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this
 {
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	struct rq *src_rq;
 	int rt_overload_count = rt_overloaded(this_rq);
 
@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this
 		 * double_lock_balance, and another CPU could
 		 * alter this_rq
 		 */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				resched = true;
+			}
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class
 	.rq_offline             = rq_offline_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
+	.find_lock_rq		= find_lock_lowest_rq,
 #endif
 
 	.task_tick		= task_tick_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1057,6 +1057,8 @@ struct rq {
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	unsigned int		nr_pinned;
 #endif
+	unsigned int		push_busy;
+	struct cpu_stop_work	push_work;
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#define MDF_PUSH	0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	return p->migration_disabled;
+#else
+	return false;
+#endif
+}
 
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
@@ -1816,6 +1828,8 @@ struct sched_class {
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+
+	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
 #endif
 
 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1911,6 +1925,24 @@ extern void trigger_load_balance(struct
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+	struct task_struct *p = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (rq->push_busy)
+		return NULL;
+
+	if (p->nr_cpus_allowed == 1)
+		return NULL;
+
+	rq->push_busy = true;
+	return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
 #endif
 
 #ifdef CONFIG_CPU_IDLE



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 16/19] sched/proc: Print accurate cpumask vs migrate_disable()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (14 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
                   ` (2 subsequent siblings)
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Ensure /proc/*/status doesn't print 'random' cpumasks due to
migrate_disable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/proc/array.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -382,9 +382,9 @@ static inline void task_context_switch_c
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 17/19] sched: Add migrate_disable() tracepoints
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (15 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 19/19] sched: Comment affine_move_task() Peter Zijlstra
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

XXX write a tracer:

 - 'migrate_disable() -> migrate_enable()' time in task_sched_runtime()
 - 'migrate_pull -> sched-in' time in task_sched_runtime()

The first will give the worst case for the second, which is the actual
interference experienced by the task due to migration constraints of
migrate_disable().
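
A rough skeleton for such a tracer could hook the new bare tracepoints
from a module, along the lines of the sketch below (untested; assumes
the tracepoints end up defined via CREATE_TRACE_POINTS and exported
like the other bare sched tracepoints). It only logs the transitions;
the actual time accounting against task_sched_runtime() is still the
exercise above.

	#include <linux/module.h>
	#include <linux/sched.h>
	#include <trace/events/sched.h>

	static void probe_migrate_disable(void *data, struct task_struct *p)
	{
		trace_printk("%s[%d] migrate_disable()\n", p->comm, p->pid);
	}

	static void probe_migrate_enable(void *data, struct task_struct *p)
	{
		trace_printk("%s[%d] migrate_enable()\n", p->comm, p->pid);
	}

	static int __init md_tracer_init(void)
	{
		int ret;

		ret = register_trace_sched_migrate_disable_tp(probe_migrate_disable, NULL);
		if (ret)
			return ret;

		ret = register_trace_sched_migrate_enable_tp(probe_migrate_enable, NULL);
		if (ret)
			unregister_trace_sched_migrate_disable_tp(probe_migrate_disable, NULL);

		return ret;
	}

	static void __exit md_tracer_exit(void)
	{
		unregister_trace_sched_migrate_enable_tp(probe_migrate_enable, NULL);
		unregister_trace_sched_migrate_disable_tp(probe_migrate_disable, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(md_tracer_init);
	module_exit(md_tracer_exit);
	MODULE_LICENSE("GPL");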

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/trace/events/sched.h |   12 ++++++++++++
 kernel/sched/core.c          |    4 ++++
 kernel/sched/deadline.c      |    1 +
 kernel/sched/rt.c            |    8 +++++++-
 4 files changed, 24 insertions(+), 1 deletion(-)

--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -646,6 +646,18 @@ DECLARE_TRACE(sched_update_nr_running_tp
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_TRACE(sched_migrate_disable_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
+DECLARE_TRACE(sched_migrate_enable_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
+DECLARE_TRACE(sched_migrate_pull_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,6 +1732,8 @@ void migrate_disable(void)
 		return;
 	}
 
+	trace_sched_migrate_disable_tp(p);
+
 	preempt_disable();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
@@ -1764,6 +1766,8 @@ void migrate_enable(void)
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
 	preempt_enable();
+
+	trace_sched_migrate_enable_tp(p);
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2245,6 +2245,7 @@ static void pull_dl_task(struct rq *this
 				goto skip;
 
 			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
 				push_task = get_push_task(src_rq);
 			} else {
 				deactivate_task(src_rq, p, 0);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1877,7 +1877,12 @@ static int push_rt_task(struct rq *rq, b
 		struct task_struct *push_task = NULL;
 		int cpu;
 
-		if (!pull || rq->push_busy)
+		if (!pull)
+			return 0;
+
+		trace_sched_migrate_pull_tp(next_task);
+
+		if (rq->push_busy)
 			return 0;
 
 		cpu = find_lowest_rq(rq->curr);
@@ -2223,6 +2228,7 @@ static void pull_rt_task(struct rq *this
 				goto skip;
 
 			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
 				push_task = get_push_task(src_rq);
 			} else {
 				deactivate_task(src_rq, p, 0);



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (16 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  2020-10-15 11:05 ` [PATCH v3 19/19] sched: Comment affine_move_task() Peter Zijlstra
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Valentin Schneider <valentin.schneider@arm.com>

  migrate_disable();
  set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
  affine_move_task(); <-- never returns

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201013140116.26651-1-valentin.schneider@arm.com
---
 kernel/sched/core.c |   13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2182,8 +2182,17 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
-		goto out;
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		if (cpumask_equal(&p->cpus_mask, new_mask))
+			goto out;
+
+		if (WARN_ON_ONCE(p == current &&
+				 is_migration_disabled(p) &&
+				 !cpumask_test_cpu(task_cpu(p), new_mask))) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
 
 	/*
 	 * Picking a ~random cpu helps in cases where we are changing affinity



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 19/19] sched: Comment affine_move_task()
  2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (17 preceding siblings ...)
  2020-10-15 11:05 ` [PATCH v3 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Peter Zijlstra
@ 2020-10-15 11:05 ` Peter Zijlstra
  18 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 11:05 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Valentin Schneider <valentin.schneider@arm.com>


Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201013140116.26651-2-valentin.schneider@arm.com
---
 kernel/sched/core.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2028,7 +2028,75 @@ struct set_affinity_pending {
 };
 
 /*
- * This function is wildly self concurrent, consider at least 3 times.
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *                                <blocks>
+ *     <resumes>
+ *     migrate_enable();
+ *       __set_cpus_allowed_ptr();
+ *       <wakes local stopper>
+ *                         `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0            P1                             P2
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                        set_cpus_allowed_ptr(P0, [1]);
+ *                          <blocks>
+ *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
+ *                                                         <signal completion>
+ *                          <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded an uninstallion of
+ * p->migration_pending done with p->pi_lock held.
  */
 static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 			    struct task_struct *p, int dest_cpu, unsigned int flags)
@@ -2071,6 +2139,7 @@ static int affine_move_task(struct rq *r
 	if (!(flags & SCA_MIGRATE_ENABLE)) {
 		/* serialized by p->pi_lock */
 		if (!p->migration_pending) {
+			/* Install the request */
 			refcount_set(&my_pending.refs, 1);
 			init_completion(&my_pending.done);
 			p->migration_pending = &my_pending;
@@ -2109,7 +2178,11 @@ static int affine_move_task(struct rq *r
 	}
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
-
+		/*
+		 * Lessen races (and headaches) by delegating
+		 * is_migration_disabled(p) checks to the stopper, which will
+		 * run on the same CPU as said p.
+		 */
 		task_rq_unlock(rq, p, rf);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 
@@ -2134,6 +2207,10 @@ static int affine_move_task(struct rq *r
 	if (refcount_dec_and_test(&pending->refs))
 		wake_up_var(&pending->refs);
 
+	/*
+	 * Block the original owner of &pending until all subsequent callers
+	 * have seen the completion and decremented the refcount
+	 */
 	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
 
 	return 0;



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-15 11:05 ` [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-10-15 13:54   ` Valentin Schneider
  2020-10-15 14:19     ` Peter Zijlstra
  2020-10-16 12:48   ` Valentin Schneider
  1 sibling, 1 reply; 27+ messages in thread
From: Valentin Schneider @ 2020-10-15 13:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 15/10/20 12:05, Peter Zijlstra wrote:
> +static int affine_move_task(struct rq *rq, struct rq_flags *rf,
> +			    struct task_struct *p, int dest_cpu, unsigned int flags)
> +{
> +	struct set_affinity_pending my_pending = { }, *pending = NULL;
> +	struct migration_arg arg = {
> +		.task = p,
> +		.dest_cpu = dest_cpu,
> +	};
> +	bool complete = false;
> +
> +	/* Can the task run on the task's current CPU? If so, we're done */
> +	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
> +		pending = p->migration_pending;
> +		if (pending) {
> +			p->migration_pending = NULL;
> +			complete = true;

Deciphering my TLA+ deadlock traces leads me to think this needs

                        refcount_inc(&pending->refs);

because the 'goto do_complete' leads us to an unconditional decrement.

> +		}
> +		task_rq_unlock(rq, p, rf);
> +
> +		if (complete)
> +			goto do_complete;
                        ^^^^
                       that here

> +
> +		return 0;
> +	}
> +

[...]

> +do_complete:
> +		if (complete)
> +			complete_all(&pending->done);
> +	}
> +
> +	wait_for_completion(&pending->done);
> +
> +	if (refcount_dec_and_test(&pending->refs))
           ^^^^^^^^^^^^^^^^^^^^^^^
           leads to this guy there

> +		wake_up_var(&pending->refs);
> +
> +	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
> +
> +	return 0;
> +}

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-15 13:54   ` Valentin Schneider
@ 2020-10-15 14:19     ` Peter Zijlstra
  0 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-15 14:19 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Oct 15, 2020 at 02:54:53PM +0100, Valentin Schneider wrote:
> 
> On 15/10/20 12:05, Peter Zijlstra wrote:
> > +static int affine_move_task(struct rq *rq, struct rq_flags *rf,
> > +			    struct task_struct *p, int dest_cpu, unsigned int flags)
> > +{
> > +	struct set_affinity_pending my_pending = { }, *pending = NULL;
> > +	struct migration_arg arg = {
> > +		.task = p,
> > +		.dest_cpu = dest_cpu,
> > +	};
> > +	bool complete = false;
> > +
> > +	/* Can the task run on the task's current CPU? If so, we're done */
> > +	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
> > +		pending = p->migration_pending;
> > +		if (pending) {
> > +			p->migration_pending = NULL;
> > +			complete = true;
> 
> Deciphering my TLA+ deadlock traces leads me to think this needs
> 
>                         refcount_inc(&pending->refs);
> 
> because the 'goto do_complete' leads us to an unconditional decrement.

Hurmm. I think you're right. I've updated the patch.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
       [not found]   ` <BN8PR12MB2978A3BB8062DF4CEEB184109A030@BN8PR12MB2978.namprd12.prod.outlook.com>
@ 2020-10-16  9:35     ` Peter Zijlstra
  0 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-16  9:35 UTC (permalink / raw)
  To: zhout, g
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

On Fri, Oct 16, 2020 at 11:43:17AM +0800, zhout wrote:
> On Thu, Oct 15, 2020 at 01:05:43PM +0200, Peter Zijlstra wrote:
> > From: Thomas Gleixner <tglx@linutronix.de>
> > 
> 
> [ ... ]  
> 
> > +static inline bool rq_has_pinned_tasks(struct rq *rq)
> > +{
> > +	return rq->nr_pinned;
> > +}
> > +
> 
> Even though I do not understand this well enough, the return type of the
> above function is bool while rq->nr_pinned is unsigned int.
> Write it this way:
> 
> static inline bool rq_has_pinned_tasks(struct rq *rq)
> {
> 	return !!rq->nr_pinned;
> }

The C compiler does this for us. C has a lot of implicit type
conversions.

It is the same mechanism at work when we write:

	if (rq->nr_pinned)

We know that works and we don't have to write:

	if (!!rq->nr_pinned)

(although we may, but it is pointless).
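
For illustration only, a standalone equivalent of that conversion
outside the kernel (nothing to do with the patch itself):

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned int nr_pinned = 3;

	static bool has_pinned(void)
	{
		/* unsigned int -> bool: any non-zero value becomes true */
		return nr_pinned;
	}

	int main(void)
	{
		printf("%d\n", has_pinned());	/* prints 1 */
		return 0;
	}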


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-15 11:05 ` [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
  2020-10-15 13:54   ` Valentin Schneider
@ 2020-10-16 12:48   ` Valentin Schneider
       [not found]     ` <BN8PR12MB29784D239007D0D6CA3F4F2A9A010@BN8PR12MB2978.namprd12.prod.outlook.com>
  1 sibling, 1 reply; 27+ messages in thread
From: Valentin Schneider @ 2020-10-16 12:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 15/10/20 12:05, Peter Zijlstra wrote:
> @@ -1862,15 +1875,27 @@ static int migration_cpu_stop(void *data
>        * we're holding p->pi_lock.
>        */
>       if (task_rq(p) == rq) {
> +		if (is_migration_disabled(p))
> +			goto out;
> +
>               if (task_on_rq_queued(p))
>                       rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
>               else
>                       p->wake_cpu = arg->dest_cpu;
> +
> +		if (arg->done) {
> +			p->migration_pending = NULL;
> +			complete = true;

Ok so nasty ahead:

P0@CPU0             P1                    P2                      stopper

migrate_disable();
                   sca(P0, {CPU1});
                     <installs pending>
migrate_enable();
  <kicks stopper>
                                          sca(P0, {CPU0});
                                             <locks>
                                             <local, has pending:
                                              goto do_complete>
                                             <unlocks>
                                             complete_all();
                                             refcount_dec();
                     refcount_dec();
                   <done>
                                           <done>

                                                                  <locks>
                                                                  <fiddles with pending->arg->done>

First, P2 can clear p->migration_pending before the stopper gets to run.

Second, the complete_all() is done without pi / rq locks held, but P2 might
get to it before the stopper does. This may cause &pending to be popped off
the stack before the stopper gets to it, so mayhaps we would need the below
hunk.

The move_queued_task() from the stopper is "safe" in that we won't kick a
task outside of its allowed mask, although we may move it around for no
reason - I tried to prevent that.

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5b6eac07adb..1ebf653c2c2f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1859,6 +1859,13 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
 	return rq;
 }
 
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
+};
+
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
@@ -1866,6 +1873,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
  */
 static int migration_cpu_stop(void *data)
 {
+	struct set_affinity_pending *pending;
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
@@ -1886,13 +1894,22 @@ static int migration_cpu_stop(void *data)
 
 	raw_spin_lock(&p->pi_lock);
 	rq_lock(rq, &rf);
+
+	if (arg->done)
+		pending = container_of(arg->done, struct set_affinity_pending, done);
 	/*
 	 * If task_rq(p) != rq, it cannot be migrated here, because we're
 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
-		if (is_migration_disabled(p))
+		/*
+		 * An affinity update may have raced with us.
+		 * p->migration_pending could now be NULL, or could be pointing
+		 * elsewhere entirely.
+		 */
+		if (is_migration_disabled(p) ||
+		    (arg->done && p->migration_pending != pending))
 			goto out;
 
 		if (task_on_rq_queued(p))
@@ -2024,13 +2041,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	__do_set_cpus_allowed(p, new_mask, 0);
 }
 
-struct set_affinity_pending {
-	refcount_t		refs;
-	struct completion	done;
-	struct cpu_stop_work	stop_work;
-	struct migration_arg	arg;
-};
-
 /*
  * This function is wildly self concurrent; here be dragons.
  *

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
       [not found]     ` <BN8PR12MB29784D239007D0D6CA3F4F2A9A010@BN8PR12MB2978.namprd12.prod.outlook.com>
@ 2020-10-18 15:51       ` Valentin Schneider
       [not found]         ` <BN8PR12MB2978F76887133CCA2102B7589A1E0@BN8PR12MB2978.namprd12.prod.outlook.com>
  0 siblings, 1 reply; 27+ messages in thread
From: Valentin Schneider @ 2020-10-18 15:51 UTC (permalink / raw)
  To: ouwen210
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj


Hi,

On 18/10/20 10:46, ouwen wrote:
> On Fri, Oct 16, 2020 at 01:48:17PM +0100, Valentin Schneider wrote:
>> ---
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index a5b6eac07adb..1ebf653c2c2f 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -1859,6 +1859,13 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
>>  	return rq;
>>  }
>>  
>> +struct set_affinity_pending {
>> +	refcount_t		refs;
>> +	struct completion	done;
>> +	struct cpu_stop_work	stop_work;
>> +	struct migration_arg	arg;
>> +};
>> +
>>  /*
>>   * migration_cpu_stop - this will be executed by a highprio stopper thread
>>   * and performs thread migration by bumping thread off CPU then
>> @@ -1866,6 +1873,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
>>   */
>>  static int migration_cpu_stop(void *data)
>>  {
>> +	struct set_affinity_pending *pending;
>>  	struct migration_arg *arg = data;
>>  	struct task_struct *p = arg->task;
>>  	struct rq *rq = this_rq();
>> @@ -1886,13 +1894,22 @@ static int migration_cpu_stop(void *data)
>>  
>>  	raw_spin_lock(&p->pi_lock);
>>  	rq_lock(rq, &rf);
>> +
>> +	if (arg->done)
>
> If I'm not wrong (always likely), arg->done points to the done of the
> pending installed by the first task that called SCA. It should not be
> NULL, because it is a pointer to a stack address, unrelated to the
> contents of that stack.
>

Correct; here I'm using it as an indicator of whether migration_cpu_stop()
was invoked by SCA with a pending affinity request. I'll admit it's icky;
I'd prefer having an explicit flag to check against.
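
Just to illustrate, something like the below is the shape I have in mind.
Only a sketch: the ->flags member of struct migration_arg and the
MIGRATION_ARG_PENDING bit are made up here, they don't exist in the patch
as posted.

	/* hypothetical bit carried in the stop-work argument */
	#define MIGRATION_ARG_PENDING	0x01

	/* affine_move_task(), when queueing work for a pending request: */
	pending->arg.flags |= MIGRATION_ARG_PENDING;

	/* migration_cpu_stop(): */
	if (arg->flags & MIGRATION_ARG_PENDING)
		pending = container_of(arg->done,
				       struct set_affinity_pending, done);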

>> +		pending = container_of(arg->done, struct set_affinity_pending, done);
>>  	/*
>>  	 * If task_rq(p) != rq, it cannot be migrated here, because we're
>>  	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
>>  	 * we're holding p->pi_lock.
>>  	 */
>>  	if (task_rq(p) == rq) {
>> -		if (is_migration_disabled(p))
>> +		/*
>> +		 * An affinity update may have raced with us.
>> +		 * p->migration_pending could now be NULL, or could be pointing
>> +		 * elsewhere entirely.
>> +		 */
>> +		if (is_migration_disabled(p) ||
>> +		    (arg->done && p->migration_pending != pending))
>                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> p->migration_pending can be set to a pending living on whichever task's
> stack called SCA, and that address could happen to be the same as the
> previous pending's. It's very, very unlikely. But I may well be
> completely wrong here.
>

Do you mean if we encounter the above race, but on top of that a new
pending gets installed that has the *same* address as the previous one?

That would mean that the task which installed that first pending got out of
affine_move_task() and *back into it*, with the same stack depth, before the
stopper got to run & grab the task_rq_lock. I also thought about this, but
am unsure how far to push the paranoia.


Side thought: don't we need to NULL p->migration_pending in __sched_fork()?
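
Concretely, the sketch I have in mind is the below (where exactly it goes
and whether it wants a CONFIG_SMP guard I haven't checked):

	/* __sched_fork(): don't let the child inherit the parent's pending */
	p->migration_pending = NULL;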

> I couldn't come up with anything at the time, so this is just noise:
> maybe use refcount_add/dec on the MIGRATE_ENABLE path to prevent that?
> Not sure yet.
>

One annoying thing is that in that path we can't wait on the refcount
reaching 0, since migrate_{disable, enable}() disable preemption.
(the stopper is only schedule()'d upon reenabling preemption in
migrate_enable()).

Including the stopper callback in the refcount chain would probably reduce
future headaches, but it's not as straightforward.
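
The rough shape would be something like the below - only a sketch, and the
missing piece is the owner of the on-stack pending then having to wait for
the count to hit zero before its stack frame can go away, which is exactly
the not-so-straightforward part:

	/* affine_move_task(): make the stopper a counted user of *pending */
	refcount_inc(&pending->refs);
	stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
			    &pending->arg, &pending->stop_work);

	/* migration_cpu_stop(): drop it once *pending won't be touched again */
	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs);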


>>  			goto out;
>>  
>>  		if (task_on_rq_queued(p))
>> @@ -2024,13 +2041,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
>>  	__do_set_cpus_allowed(p, new_mask, 0);
>>  }
>>  
>> -struct set_affinity_pending {
>> -	refcount_t		refs;
>> -	struct completion	done;
>> -	struct cpu_stop_work	stop_work;
>> -	struct migration_arg	arg;
>> -};
>> -
>>  /*
>>   * This function is wildly self concurrent; here be dragons.
>>   *


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
       [not found]         ` <BN8PR12MB2978F76887133CCA2102B7589A1E0@BN8PR12MB2978.namprd12.prod.outlook.com>
@ 2020-10-19 17:36           ` Valentin Schneider
       [not found]             ` <BN8PR12MB2978D36FF4A81C344DCF37FD9A1F0@BN8PR12MB2978.namprd12.prod.outlook.com>
  0 siblings, 1 reply; 27+ messages in thread
From: Valentin Schneider @ 2020-10-19 17:36 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj


On 19/10/20 17:32, Tao Zhou wrote:
> Hi Valentin,
>>
>> Side thought: don't we need to NULL p->migration_pending in __sched_fork()?
>>
>
> No need; if a fork happens, the forked task will inherit that pending.

Which is what I'm worrying about. I think we can ignore migrate_disable()
for now and just consider a task fork()'ing while another tries to
sched_setaffinity() it; say the fork() happens just after the pending has
been installed.

Unless I'm overthinking it, we could end up with a dangling pending
reference, which will wreak havoc on any later affine_move_task() (see the
if (p->migration_pending) paths).

> If pending is NULL, a later SCA call will install a new pending. If pending
> is not NULL, it means the task is inside a migrate-disabled region and SCA
> waits for completion. If we NULL the pending in __sched_fork(), a new
> pending would be installed in that case, which does not look right to me.
> Adding a flag (ENABLE_MIGRATE) to the arg would tell the two paths apart.
>
> I'm always likely to miss something.
>
> Tao
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
       [not found]             ` <BN8PR12MB2978D36FF4A81C344DCF37FD9A1F0@BN8PR12MB2978.namprd12.prod.outlook.com>
@ 2020-10-20  7:56               ` Peter Zijlstra
  0 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2020-10-20  7:56 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Valentin Schneider, tglx, mingo, linux-kernel, bigeasy,
	qais.yousef, swood, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj


FYI Tao, I shared this with Valentin on IRC yesterday evening (we're
both in Europe):

  https://paste.debian.net/1167885/

I'll be going over it again this morning with a (hopefully) fresh(er) mind.

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2020-10-20  7:56 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-15 11:05 [PATCH v3 00/19] sched: Migrate disable support Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 02/19] sched: Fix balance_callback() Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 05/19] workqueue: Manually break affinity " Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 09/19] sched: Add migrate_disable() Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
2020-10-15 13:54   ` Valentin Schneider
2020-10-15 14:19     ` Peter Zijlstra
2020-10-16 12:48   ` Valentin Schneider
     [not found]     ` <BN8PR12MB29784D239007D0D6CA3F4F2A9A010@BN8PR12MB2978.namprd12.prod.outlook.com>
2020-10-18 15:51       ` Valentin Schneider
     [not found]         ` <BN8PR12MB2978F76887133CCA2102B7589A1E0@BN8PR12MB2978.namprd12.prod.outlook.com>
2020-10-19 17:36           ` Valentin Schneider
     [not found]             ` <BN8PR12MB2978D36FF4A81C344DCF37FD9A1F0@BN8PR12MB2978.namprd12.prod.outlook.com>
2020-10-20  7:56               ` Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
     [not found]   ` <BN8PR12MB2978A3BB8062DF4CEEB184109A030@BN8PR12MB2978.namprd12.prod.outlook.com>
2020-10-16  9:35     ` Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Peter Zijlstra
2020-10-15 11:05 ` [PATCH v3 19/19] sched: Comment affine_move_task() Peter Zijlstra
