linux-kernel.vger.kernel.org archive mirror
* [PATCH -v2 00/17] sched: Migrate disable support
@ 2020-10-05 14:57 Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 01/17] stop_machine: Add function and caller debug info Peter Zijlstra
                   ` (18 more replies)
  0 siblings, 19 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Hi,

Second version of migrate_disable(); it has a bunch of bugs fixed and has
grown rt/dl balancer changes to push away tasks that inhibit the migration of
migrate_disable() tasks.

I still don't like it much, and it very much hurts my brain, but it seems to have
stopped crashing in weird and wonderful ways.

Tested on PREEMPT_RT.
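
For reference, the kind of section migrate_disable() is meant to protect is
sketched below (illustrative only; "my_counter" and poke_counter() are made-up
names, not part of this series):

	static DEFINE_PER_CPU(int, my_counter);

	static void poke_counter(void)
	{
		migrate_disable();		/* preemptible, but pinned to this CPU */
		this_cpu_inc(my_counter);	/* per-cpu access stays on one CPU */
		migrate_enable();
	}

On PREEMPT_RT such sections can no longer simply use preempt_disable(), hence
the need for this (unloved) primitive.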

---
 fs/proc/array.c               |   4 +-
 include/linux/cpuhotplug.h    |   1 +
 include/linux/cpumask.h       |   6 +
 include/linux/preempt.h       |  64 ++++
 include/linux/sched.h         |   5 +
 include/linux/sched/hotplug.h |   2 +
 include/linux/stop_machine.h  |   5 +
 include/trace/events/sched.h  |  12 +
 kernel/cpu.c                  |   9 +-
 kernel/sched/core.c           | 764 ++++++++++++++++++++++++++++++++----------
 kernel/sched/cpudeadline.c    |   4 +-
 kernel/sched/cpupri.c         |   4 +-
 kernel/sched/deadline.c       |  43 ++-
 kernel/sched/rt.c             |  84 +++--
 kernel/sched/sched.h          |  59 +++-
 kernel/stop_machine.c         |  23 +-
 kernel/workqueue.c            |   4 +
 lib/cpumask.c                 |  18 +
 lib/dump_stack.c              |   2 +
 lib/smp_processor_id.c        |   5 +
 20 files changed, 895 insertions(+), 223 deletions(-)



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 01/17] stop_machine: Add function and caller debug info
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 02/17] sched: Fix balance_callback() Peter Zijlstra
                   ` (17 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Crashes in stop-machine are hard to connect to the calling code; add a
little something to help with that.
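
The idea, in a stand-alone userspace analogue (an illustrative sketch only;
the kernel side uses _RET_IP_ and %pS symbolization, this analogue uses
__builtin_return_address() and plain %p, and all names are made up):

	#include <stdio.h>

	struct work {
		void (*fn)(void *arg);
		void *arg;
		void *caller;		/* new: records who queued the work */
	};

	static void stop_fn(void *arg)
	{
		printf("running with arg %p\n", arg);
	}

	static __attribute__((noinline))
	void queue_stop_work(struct work *w, void (*fn)(void *), void *arg)
	{
		w->fn = fn;
		w->arg = arg;
		w->caller = __builtin_return_address(0);	/* ~ _RET_IP_ */
	}

	int main(void)
	{
		struct work w;

		queue_stop_work(&w, stop_fn, NULL);
		/* what print_stop_info() emits, minus the symbol names */
		printf("Stopper: %p <- %p\n", (void *)w.fn, w.caller);
		w.fn(w.arg);
		return 0;
	}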

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/stop_machine.h |    5 +++++
 kernel/stop_machine.c        |   23 ++++++++++++++++++++---
 lib/dump_stack.c             |    2 ++
 3 files changed, 27 insertions(+), 3 deletions(-)

--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
+	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else	/* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(u
 	return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif	/* CONFIG_SMP */
 
 /*
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,11 +42,23 @@ struct cpu_stopper {
 	struct list_head	works;		/* list of pending works */
 
 	struct cpu_stop_work	stop_work;	/* for stop_cpus */
+	unsigned long		caller;
+	cpu_stop_fn_t		fn;
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 static bool stop_machine_initialized = false;
 
+void print_stop_info(const char *log_lvl, struct task_struct *task)
+{
+	struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper);
+
+	if (task != stopper->thread)
+		return;
+
+	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
+}
+
 /* static data for stop_cpus */
 static DEFINE_MUTEX(stop_cpus_mutex);
 static bool stop_cpus_in_progress;
@@ -123,7 +135,7 @@ static bool cpu_stop_queue_work(unsigned
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 {
 	struct cpu_stop_done done;
-	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
 
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
@@ -331,7 +343,8 @@ int stop_two_cpus(unsigned int cpu1, uns
 	work1 = work2 = (struct cpu_stop_work){
 		.fn = multi_cpu_stop,
 		.arg = &msdata,
-		.done = &done
+		.done = &done,
+		.caller = _RET_IP_,
 	};
 
 	cpu_stop_init_done(&done, 2);
@@ -367,7 +380,7 @@ int stop_two_cpus(unsigned int cpu1, uns
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
-	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -487,6 +500,8 @@ static void cpu_stopper_thread(unsigned
 		int ret;
 
 		/* cpu stop callbacks must not sleep, make in_atomic() == T */
+		stopper->caller = work->caller;
+		stopper->fn = fn;
 		preempt_count_inc();
 		ret = fn(arg);
 		if (done) {
@@ -495,6 +510,8 @@ static void cpu_stopper_thread(unsigned
 			cpu_stop_signal_done(done);
 		}
 		preempt_count_dec();
+		stopper->fn = NULL;
+		stopper->caller = 0;
 		WARN_ONCE(preempt_count(),
 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
 		goto repeat;
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -12,6 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/kexec.h>
 #include <linux/utsname.h>
+#include <linux/stop_machine.h>
 
 static char dump_stack_arch_desc_str[128];
 
@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *l
 		       log_lvl, dump_stack_arch_desc_str);
 
 	print_worker_info(log_lvl, current);
+	print_stop_info(log_lvl, current);
 }
 
 /**



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 02/17] sched: Fix balance_callback()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 01/17] stop_machine: Add function and caller debug info Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 03/17] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
                   ` (16 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme:
balance_callback() was called after rq->lock was dropped, which meant
another CPU could interleave and touch the callback list.

Rework the code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.
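
The splice-then-run pattern in isolation, as a compilable userspace sketch
(a pthread mutex stands in for rq->lock; the names are made up):

	#include <pthread.h>
	#include <stddef.h>

	struct callback {
		struct callback *next;
		void (*func)(void);
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct callback *balance_list;	/* ~ rq->balance_callback */

	/* Detach the whole list while holding the lock; it is now private. */
	static struct callback *splice_callbacks(void)
	{
		struct callback *head = balance_list;

		balance_list = NULL;
		return head;
	}

	/* Run the private list; a concurrent queuer starts a fresh list. */
	static void run_callbacks(struct callback *head)
	{
		while (head) {
			struct callback *next = head->next;

			head->func();
			head = next;
		}
	}

	void flush_callbacks(void)
	{
		pthread_mutex_lock(&lock);
		run_callbacks(splice_callbacks());	/* still under the lock */
		pthread_mutex_unlock(&lock);
	}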

Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  119 ++++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    3 +
 2 files changed, 78 insertions(+), 44 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3494,6 +3494,69 @@ static inline void finish_task(struct ta
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	void (*func)(struct rq *rq);
+	struct callback_head *next;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	struct callback_head *head = rq->balance_callback;
+
+	lockdep_assert_held(&rq->lock);
+	if (head)
+		rq->balance_callback = NULL;
+
+	return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	unsigned long flags;
+
+	if (unlikely(head)) {
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		do_balance_callbacks(rq, head);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3519,6 +3582,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3660,43 +3724,6 @@ static struct rq *finish_task_switch(str
 	return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-	struct callback_head *head, *next;
-	void (*func)(struct rq *rq);
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	head = rq->balance_callback;
-	rq->balance_callback = NULL;
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-	if (unlikely(rq->balance_callback))
-		__balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3716,7 +3743,6 @@ asmlinkage __visible void schedule_tail(
 	 */
 
 	rq = finish_task_switch(prev);
-	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -4532,10 +4558,11 @@ static void __sched notrace __schedule(b
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-		rq_unlock_irq(rq, &rf);
-	}
 
-	balance_callback(rq);
+		rq_unpin_lock(rq, &rf);
+		__balance_callbacks(rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
 }
 
 void __noreturn do_task_dead(void)
@@ -4946,9 +4973,11 @@ void rt_mutex_setprio(struct task_struct
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	__task_rq_unlock(rq, &rf);
 
-	balance_callback(rq);
+	rq_unpin_lock(rq, &rf);
+	__balance_callbacks(rq);
+	raw_spin_unlock(&rq->lock);
+
 	preempt_enable();
 }
 #else
@@ -5222,6 +5251,7 @@ static int __sched_setscheduler(struct t
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
 	const struct sched_class *prev_class;
+	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5460,6 +5490,7 @@ static int __sched_setscheduler(struct t
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi) {
@@ -5468,7 +5499,7 @@ static int __sched_setscheduler(struct t
 	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callback(rq);
+	balance_callbacks(rq, head);
 	preempt_enable();
 
 	return 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
 #endif
+#ifdef CONFIG_SMP
+	SCHED_WARN_ON(rq->balance_callback);
+#endif
 }
 
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 03/17] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 01/17] stop_machine: Add function and caller debug info Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 02/17] sched: Fix balance_callback() Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-06  7:25   ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 04/17] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
                   ` (15 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task, which is a per-cpu kthread; completing the hotplug operation
only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callback) test at the tail of
context_switch() with an unlikely(rq->balance_flags) test; the fast
path is not affected.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  118 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    7 ++-
 2 files changed, 122 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3535,6 +3537,22 @@ static inline void balance_callbacks(str
 	}
 }
 
+static bool balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+	if (unlikely(rq->balance_flags)) {
+		/*
+		 * Run the balance_callbacks, except on hotplug
+		 * when we need to push the current task away.
+		 */
+		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+		    !(rq->balance_flags & BALANCE_PUSH) ||
+		    !balance_push(rq))
+			__balance_callbacks(rq);
+	}
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -3550,6 +3568,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
@@ -3577,7 +3599,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6836,6 +6858,93 @@ static void migrate_tasks(struct rq *dea
 
 	rq->stop = stop;
 }
+
+static int __balance_push_cpu_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	update_rq_clock(rq);
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this case and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return false;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+
+	return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+	return false;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6921,6 +7030,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6968,6 +7079,8 @@ int sched_cpu_deactivate(unsigned int cp
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7094,7 @@ int sched_cpu_deactivate(unsigned int cp
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1385,6 +1386,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1392,12 +1396,13 @@ queue_balance_callback(struct rq *rq,
 {
 	lockdep_assert_held(&rq->lock);
 
-	if (unlikely(head->next))
+	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
 		return;
 
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 04/17] sched/core: Wait for tasks being pushed away on hotplug
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 03/17] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 05/17] workqueue: Manually break affinity " Peter Zijlstra
                   ` (14 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

From: Thomas Gleixner <tglx@linutronix.de>

RT kernels need to ensure that all tasks which are not per CPU kthreads
have left the outgoing CPU to guarantee that no tasks are force migrated
within a migrate disabled section.

There is also some desire to (ab)use fine-grained CPU hotplug control to
clear a CPU from active state in order to force-migrate tasks which are not
per CPU kthreads away for power control purposes.

Add a mechanism which waits until all tasks which should leave the CPU
after the CPU active flag is cleared have moved to a different online CPU.
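
The wait/wake pairing, reduced to its bones (a sketch only, not stand-alone
code; "nr_left" and the function names are made up, the rcuwait calls are the
ones the patch uses):

	static struct rcuwait hotplug_wait;	/* rcuwait_init() at setup time */
	static atomic_t nr_left;

	/* hotplug control thread: block until only we remain */
	static void wait_until_quiescent(void)
	{
		rcuwait_wait_event(&hotplug_wait, atomic_read(&nr_left) == 1,
				   TASK_UNINTERRUPTIBLE);
	}

	/* departing task: kick the waiter once the count drops to one */
	static void task_left_cpu(void)
	{
		if (atomic_dec_return(&nr_left) == 1 &&
		    rcuwait_active(&hotplug_wait))
			rcuwait_wake_up(&hotplug_wait);
	}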

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   40 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    4 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6900,8 +6900,21 @@ static bool balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task))
+	if (is_per_cpu_kthread(push_task)) {
+		/*
+		 * If this is the idle task on the outgoing CPU try to wake
+		 * up the hotplug control thread which might wait for the
+		 * last task to vanish. The rcuwait_active() check is
+		 * accurate here because the waiter is pinned on this CPU
+		 * and can't obviously be running in parallel.
+		 */
+		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+			raw_spin_unlock(&rq->lock);
+			rcuwait_wake_up(&rq->hotplug_wait);
+			raw_spin_lock(&rq->lock);
+		}
 		return false;
+	}
 
 	get_task_struct(push_task);
 	/*
@@ -6934,6 +6947,20 @@ static void balance_push_set(int cpu, bo
 	rq_unlock_irqrestore(rq, &rf);
 }
 
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+	struct rq *rq = this_rq();
+
+	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+			   TASK_UNINTERRUPTIBLE);
+}
+
 #else
 
 static inline bool balance_push(struct rq *rq)
@@ -6945,6 +6972,10 @@ static void balance_push_set(int cpu, bo
 {
 }
 
+static inline void balance_hotplug_wait(void)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -7099,6 +7130,10 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+
+	/* Wait for all non per CPU kernel threads to vanish. */
+	balance_hotplug_wait();
+
 	return 0;
 }
 
@@ -7339,6 +7374,9 @@ void __init sched_init(void)
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+		rcuwait_init(&rq->hotplug_wait);
+#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1004,6 +1004,10 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct rcuwait		hotplug_wait;
+#endif
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 05/17] workqueue: Manually break affinity on hotplug
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 04/17] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 16:31   ` Tejun Heo
  2020-10-05 14:57 ` [PATCH -v2 06/17] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
                   ` (13 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Don't rely on the scheduler to force break affinity for us -- it will
stop doing that for per-cpu-kthreads.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/workqueue.c |    4 ++++
 1 file changed, 4 insertions(+)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4905,6 +4905,10 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		raw_spin_unlock_irq(&pool->lock);
+
+		for_each_pool_worker(worker, pool)
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+
 		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 06/17] sched/hotplug: Consolidate task migration on CPU unplug
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 05/17] workqueue: Manually break affinity " Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
                   ` (12 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

From: Thomas Gleixner <tglx@linutronix.de>

With the new mechanism which kicks tasks off the outgoing CPU at the end of
schedule() the situation on an outgoing CPU right before the stopper thread
brings it down completely is:

 - All user tasks and all unbound kernel threads have either been migrated
   away or are not running, and the next wakeup will move them to an online CPU.

 - All per CPU kernel threads, except the cpu hotplug thread and the stopper
   thread, have either been unbound or parked by the responsible CPU hotplug
   callback.

That means that at the last step before the stopper thread is invoked the
cpu hotplug thread is the last legitimate running task on the outgoing
CPU.

Add a final wait step right before the stopper thread is kicked, which
ensures that any still-running tasks on the way to park or on the way to
kick themselves off the CPU are either sleeping or gone.

This allows removing the migrate_tasks() crutch in sched_cpu_dying(). If
sched_cpu_dying() detects that there is still another running task aside from
the stopper thread, it will explode with the appropriate fireworks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpuhotplug.h    |    1 
 include/linux/sched/hotplug.h |    2 
 kernel/cpu.c                  |    9 ++
 kernel/sched/core.c           |  154 +++++++++---------------------------------
 4 files changed, 46 insertions(+), 120 deletions(-)

--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -151,6 +151,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned i
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif
 
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6759,120 +6759,6 @@ void idle_task_exit(void)
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-	const struct sched_class *class;
-	struct task_struct *next;
-
-	for_each_class(class) {
-		next = class->pick_next_task(rq);
-		if (next) {
-			next->sched_class->put_prev_task(rq, next);
-			return next;
-		}
-	}
-
-	/* The idle class should always have a runnable task */
-	BUG();
-}
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
-{
-	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
-	struct rq_flags orf = *rf;
-	int dest_cpu;
-
-	/*
-	 * Fudge the rq selection such that the below task selection loop
-	 * doesn't get stuck on the currently eligible stop task.
-	 *
-	 * We're currently inside stop_machine() and the rq is either stuck
-	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
-	 * either way we should never end up calling schedule() until we're
-	 * done here.
-	 */
-	rq->stop = NULL;
-
-	/*
-	 * put_prev_task() and pick_next_task() sched
-	 * class method both need to have an up-to-date
-	 * value of rq->clock[_task]
-	 */
-	update_rq_clock(rq);
-
-	for (;;) {
-		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread:
-		 */
-		if (rq->nr_running == 1)
-			break;
-
-		next = __pick_migrate_task(rq);
-
-		/*
-		 * Rules for changing task_struct::cpus_mask are holding
-		 * both pi_lock and rq->lock, such that holding either
-		 * stabilizes the mask.
-		 *
-		 * Drop rq->lock is not quite as disastrous as it usually is
-		 * because !cpu_active at this point, which means load-balance
-		 * will not interfere. Also, stop-machine.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&next->pi_lock);
-		rq_relock(rq, rf);
-
-		/*
-		 * Since we're inside stop-machine, _nothing_ should have
-		 * changed the task, WARN if weird stuff happened, because in
-		 * that case the above rq->lock drop is a fail too.
-		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-			raw_spin_unlock(&next->pi_lock);
-			continue;
-		}
-
-		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		rq = __migrate_task(rq, rf, next, dest_cpu);
-		if (rq != dead_rq) {
-			rq_unlock(rq, rf);
-			rq = dead_rq;
-			*rf = orf;
-			rq_relock(rq, rf);
-		}
-		raw_spin_unlock(&next->pi_lock);
-	}
-
-	rq->stop = stop;
-}
-
 static int __balance_push_cpu_stop(void *arg)
 {
 	struct task_struct *p = arg;
@@ -7128,10 +7014,6 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
-
-	/* Wait for all non per CPU kernel threads to vanish. */
-	balance_hotplug_wait();
-
 	return 0;
 }
 
@@ -7151,6 +7033,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+	balance_hotplug_wait();
+	return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+	long delta = calc_load_fold_active(rq, 1);
+
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7164,7 +7081,6 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (5 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 06/17] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-06 12:46   ` Vincent Guittot
  2020-10-09 20:41   ` Dietmar Eggemann
  2020-10-05 14:57 ` [PATCH -v2 08/17] sched: Massage set_cpus_allowed() Peter Zijlstra
                   ` (11 subsequent siblings)
  18 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Since we now migrate tasks away before DYING, we should also move
bandwidth unthrottle; otherwise we can gain tasks from unthrottle
after we expect all tasks to be gone already.

Also, it looks like the RT balancers don't respect cpu_active() and
instead rely in part on rq->online; complete this. This too requires
doing set_rq_offline() earlier to match the cpu_active() semantics.
(The bigger patch is to convert RT to cpu_active() entirely.)

Since set_rq_online() is called from sched_cpu_activate(), place
set_rq_offline() in sched_cpu_deactivate().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   14 ++++++++++----
 kernel/sched/deadline.c |    5 +----
 kernel/sched/rt.c       |    5 +----
 3 files changed, 12 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6979,6 +6979,8 @@ int sched_cpu_activate(unsigned int cpu)
 
 int sched_cpu_deactivate(unsigned int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
 	int ret;
 
 	set_cpu_active(cpu, false);
@@ -6993,6 +6995,14 @@ int sched_cpu_deactivate(unsigned int cp
 
 	balance_push_set(cpu, true);
 
+	rq_lock_irqsave(rq, &rf);
+	if (rq->rd) {
+		update_rq_clock();
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	rq_unlock_irqrestore(rq, &rf);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -7074,10 +7084,6 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	if (rq->rd) {
-		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-		set_rq_offline(rq);
-	}
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq);
 
 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 {
-	return dl_task(prev);
+	return rq->online && dl_task(prev);
 }
 
 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
@@ -2326,9 +2326,6 @@ static void rq_online_dl(struct rq *rq)
 /* Assumes rq->lock is held */
 static void rq_offline_dl(struct rq *rq)
 {
-	if (rq->dl.overloaded)
-		dl_clear_overload(rq);
-
 	cpudl_clear(&rq->rd->cpudl, rq->cpu);
 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
 static inline int rt_overloaded(struct rq *rq)
@@ -2245,9 +2245,6 @@ static void rq_online_rt(struct rq *rq)
 /* Assumes rq->lock is held */
 static void rq_offline_rt(struct rq *rq)
 {
-	if (rq->rt.overloaded)
-		rt_clear_overload(rq);
-
 	__disable_runtime(rq);
 
 	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 08/17] sched: Massage set_cpus_allowed()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (6 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-06 11:20   ` Valentin Schneider
  2020-10-05 14:57 ` [PATCH -v2 09/17] sched: Add migrate_disable() Peter Zijlstra
                   ` (10 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Thread a u32 flags word through the *set_cpus_allowed*() callchain.
This will allow adding behavioural tweaks for future users.
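
For now only one bit exists; usage boils down to (both lines taken from the
diff below):

	/* user-driven change (sched_setaffinity()): honour PF_NO_SETAFFINITY */
	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);

	/* kernel-internal callers keep the old, unchecked behaviour */
	return __set_cpus_allowed_ptr(p, new_mask, 0);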

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   28 ++++++++++++++++++----------
 kernel/sched/deadline.c |    5 +++--
 kernel/sched/sched.h    |    7 +++++--
 3 files changed, 26 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1828,13 +1828,14 @@ static int migration_cpu_stop(void *data
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
@@ -1855,7 +1856,7 @@ void do_set_cpus_allowed(struct task_str
 	if (running)
 		put_prev_task(rq, p);
 
-	p->sched_class->set_cpus_allowed(p, new_mask);
+	p->sched_class->set_cpus_allowed(p, new_mask, flags);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1863,6 +1864,11 @@ void do_set_cpus_allowed(struct task_str
 		set_next_task(rq, p);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1873,7 +1879,8 @@ void do_set_cpus_allowed(struct task_str
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask, bool check)
+				  const struct cpumask *new_mask,
+				  u32 flags)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	unsigned int dest_cpu;
@@ -1895,7 +1902,7 @@ static int __set_cpus_allowed_ptr(struct
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
 	 */
-	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1914,7 +1921,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	do_set_cpus_allowed(p, new_mask);
+	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -1951,7 +1958,7 @@ static int __set_cpus_allowed_ptr(struct
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, false);
+	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -2410,7 +2417,8 @@ void sched_set_stop_task(int cpu, struct
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 const struct cpumask *new_mask, bool check)
+					 const struct cpumask *new_mask,
+					 u32 flags)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
@@ -6025,7 +6033,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -6608,7 +6616,7 @@ void init_idle(struct task_struct *idle,
 	 *
 	 * And since this is boot we can forgo the serialization.
 	 */
-	set_cpus_allowed_common(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
 #endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2280,7 +2280,8 @@ static void task_woken_dl(struct rq *rq,
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
-				const struct cpumask *new_mask)
+				const struct cpumask *new_mask,
+				u32 flags)
 {
 	struct root_domain *src_rd;
 	struct rq *rq;
@@ -2309,7 +2310,7 @@ static void set_cpus_allowed_dl(struct t
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	set_cpus_allowed_common(p, new_mask);
+	set_cpus_allowed_common(p, new_mask, flags);
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1806,7 +1806,8 @@ struct sched_class {
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-				 const struct cpumask *newmask);
+				 const struct cpumask *newmask,
+				 u32 flags);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
@@ -1899,7 +1900,9 @@ extern void update_group_capacity(struct
 
 extern void trigger_load_balance(struct rq *rq);
 
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#define SCA_CHECK		0x01
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
 



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 09/17] sched: Add migrate_disable()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (7 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 08/17] sched: Massage set_cpus_allowed() Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-06 11:20   ` Valentin Schneider
  2020-10-05 14:57 ` [PATCH -v2 10/17] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
                   ` (9 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Add the base migrate_disable() support (under protest).

While migrate_disable() is (currently) required for PREEMPT_RT, it is
also one of the biggest flaws in the system.

Notably this is just the base implementation; it is broken vs
sched_setaffinity() and hotplug, both of which are solved in additional
patches for ease of review.
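
The counter nests, so callers need not know whether they already run in a
migrate-disabled region (illustrative; "my_stat" is a made-up per-cpu variable):

	migrate_disable();		/* depth 1: task is now pinned to this CPU */
	migrate_disable();		/* depth 2: nests, only bumps the counter */
	this_cpu_inc(my_stat);		/* per-cpu access, still on the same CPU */
	migrate_enable();		/* depth 1: still pinned */
	migrate_enable();		/* depth 0: migration allowed again */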

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   60 +++++++++++++++++++++++++
 include/linux/sched.h   |    3 +
 kernel/sched/core.c     |  112 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |    6 +-
 lib/smp_processor_id.c  |    5 ++
 5 files changed, 178 insertions(+), 8 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,6 +322,64 @@ static inline void preempt_notifier_init
 
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+/*
+ * Migrate-Disable and why it is (strongly) undesired.
+ *
+ * The premise of the Real-Time schedulers we have on Linux
+ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+ * concurrently, provided there are sufficient runnable tasks, also known as
+ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+ *
+ * The correctness of various scheduling models depends on this, but is it
+ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+ * preempt_disable() implies an immediate priority ceiling, preemptible
+ * migrate_disable() allows nesting.
+ *
+ * The worst case is that all tasks preempt one another in a migrate_disable()
+ * region and stack on a single CPU. This then reduces the available bandwidth
+ * to a single CPU. And since Real-Time schedulability theory considers the
+ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+ * (instantly solving the SMP analysis problem).
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migirate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on abritrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depends on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ *
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else /* !(CONFIG_SMP && CONFIG_PREEMPT_RT) */
+
 /**
  * migrate_disable - Prevent migration of the current task
  *
@@ -352,4 +410,6 @@ static __always_inline void migrate_enab
 	preempt_enable();
 }
 
+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
+
 #endif /* __LINUX_PREEMPT_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,9 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migration_disabled;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1700,6 +1700,61 @@ void check_preempt_curr(struct rq *rq, s
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask,
+				  u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+	if (likely(!p->migration_disabled))
+		return;
+
+	if (p->cpus_ptr != &p->cpus_mask)
+		return;
+
+	/*
+	 * Violates locking rules! see comment in __do_set_cpus_allowed().
+	 */
+	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+	if (current->migration_disabled++)
+		return;
+
+	barrier();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+	struct task_struct *p = current;
+
+	if (--p->migration_disabled)
+		return;
+
+	barrier();
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		return;
+
+	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return p->migration_disabled;
+}
+
+#endif
+
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1709,7 +1764,7 @@ static inline bool is_cpu_allowed(struct
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1830,6 +1885,11 @@ static int migration_cpu_stop(void *data
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
+	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+		p->cpus_ptr = new_mask;
+		return;
+	}
+
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
@@ -1840,7 +1900,22 @@ __do_set_cpus_allowed(struct task_struct
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * This here violates the locking rules for affinity, since we're only
+	 * supposed to change these variables while holding both rq->lock and
+	 * p->pi_lock.
+	 *
+	 * HOWEVER, it magically works, because ttwu() is the only code that
+	 * accesses these variables under p->pi_lock and only does so after
+	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+	 * before finish_task().
+	 *
+	 * XXX do further audits, this smells like something putrid.
+	 */
+	if (flags & SCA_MIGRATE_DISABLE)
+		SCHED_WARN_ON(!p->on_cpu);
+	else
+		lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -1891,9 +1966,14 @@ static int __set_cpus_allowed_ptr(struct
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD) {
+	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs
+		 * Kernel threads are allowed on online && !active CPUs.
+		 *
+		 * Specifically, migration_disabled() tasks must not fail the
+		 * cpumask_any_and_distribute() pick below, esp. so on
+		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
+		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	}
@@ -1907,7 +1987,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_mask, new_mask))
+	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	/*
@@ -1999,6 +2079,8 @@ void set_task_cpu(struct task_struct *p,
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
+
+	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2329,6 +2411,12 @@ static int select_fallback_rq(int cpu, s
 			}
 			fallthrough;
 		case possible:
+			/*
+			 * XXX When called from select_task_rq() we only
+			 * hold p->pi_lock and again violate locking order.
+			 *
+			 * More yuck to audit.
+			 */
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
@@ -2363,7 +2451,7 @@ int select_task_rq(struct task_struct *p
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1)
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);
@@ -2425,6 +2513,17 @@ static inline int __set_cpus_allowed_ptr
 
 #endif /* CONFIG_SMP */
 
+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return false;
+}
+
+#endif
+
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
@@ -4589,6 +4688,7 @@ static void __sched notrace __schedule(b
 		 */
 		++*switch_count;
 
+		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1895,14 +1895,16 @@ static inline bool sched_fair_runnable(s
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_next_task_idle(struct rq *rq);
 
+#define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
+
 #ifdef CONFIG_SMP
 
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
 
-#define SCA_CHECK		0x01
-
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(c
 	if (current->nr_cpus_allowed == 1)
 		goto out;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	if (current->migration_disabled)
+		goto out;
+#endif
+
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 10/17] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (8 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 09/17] sched: Add migrate_disable() Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
       [not found]   ` <CH2PR14MB41833F828B4D3BA5A7B6CE7B9A0B0@CH2PR14MB4183.namprd14.prod.outlook.com>
  2020-10-05 14:57 ` [PATCH -v2 11/17] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
                   ` (8 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Concurrent migrate_disable() and set_cpus_allowed_ptr() have
interesting interactions. We rely on set_cpus_allowed_ptr() to not return
until the task runs inside the provided mask. This expectation is
exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

	preempt_disable();
	migrate_disable();
	...
	migrate_enable();
	preempt_enable();

And:

	raw_spin_lock(&B);
	spin_unlock(&A);

This means that when migrate_enable() must restore the affinity
mask, it cannot wait for completion thereof. As luck would have it,
that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.
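
A minimal sketch (not part of the patch) of how the raw_spin_lock(&B);
spin_unlock(&A); pattern above comes about on PREEMPT_RT, where
spin_lock()/spin_unlock() map onto rtmutex-based locks bracketed by
migrate_disable()/migrate_enable():

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(A);	/* sleeping lock on PREEMPT_RT */
	static DEFINE_RAW_SPINLOCK(B);	/* always a spinning lock */

	static void example(void)
	{
		spin_lock(&A);		/* rt lock; implies migrate_disable() */
		raw_spin_lock(&B);	/* preemption disabled from here on */
		spin_unlock(&A);	/* migrate_enable() runs with B held,
					 * hence it must not schedule() */
		raw_spin_unlock(&B);
	}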

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |  161 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 139 insertions(+), 23 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 	int				migration_disabled;
 #endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1731,15 +1731,26 @@ void migrate_enable(void)
 {
 	struct task_struct *p = current;
 
-	if (--p->migration_disabled)
+	if (p->migration_disabled > 1) {
+		p->migration_disabled--;
 		return;
+	}
 
+	/*
+	 * Ensure stop_task runs either before or after this, and that
+	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+	 */
+	preempt_disable();
+	if (p->cpus_ptr != &p->cpus_mask)
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
+	 * regular cpus_mask, otherwise things that race (eg.
+	 * select_fallback_rq) get confused.
+	 */
 	barrier();
-
-	if (p->cpus_ptr == &p->cpus_mask)
-		return;
-
-	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	p->migration_disabled = 0;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -1806,6 +1817,7 @@ static struct rq *move_queued_task(struc
 struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
+	struct completion *done;
 };
 
 /*
@@ -1840,6 +1852,7 @@ static int migration_cpu_stop(void *data
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
+	bool complete = false;
 	struct rq_flags rf;
 
 	/*
@@ -1862,15 +1875,27 @@ static int migration_cpu_stop(void *data
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
+		if (is_migration_disabled(p))
+			goto out;
+
 		if (task_on_rq_queued(p))
 			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
 		else
 			p->wake_cpu = arg->dest_cpu;
+
+		if (arg->done) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
 	}
+out:
 	rq_unlock(rq, &rf);
 	raw_spin_unlock(&p->pi_lock);
-
 	local_irq_enable();
+
+	if (complete)
+		complete_all(arg->done);
+
 	return 0;
 }
 
@@ -1939,6 +1964,111 @@ void do_set_cpus_allowed(struct task_str
 	__do_set_cpus_allowed(p, new_mask, 0);
 }
 
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
+};
+
+/*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct rq_flags *rf,
+			    struct task_struct *p, int dest_cpu, unsigned int flags)
+{
+	struct set_affinity_pending my_pending = { }, *pending = NULL;
+	struct migration_arg arg = {
+		.task = p,
+		.dest_cpu = dest_cpu,
+	};
+	bool complete = false;
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		pending = p->migration_pending;
+		if (pending) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+		if (complete)
+			goto do_complete;
+
+		return 0;
+	}
+
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		/* serialized by p->pi_lock */
+		if (!p->migration_pending) {
+			refcount_set(&my_pending.refs, 1);
+			init_completion(&my_pending.done);
+			p->migration_pending = &my_pending;
+		} else {
+			pending = p->migration_pending;
+			refcount_inc(&pending->refs);
+		}
+	}
+	pending = p->migration_pending;
+	/*
+	 * - !MIGRATE_ENABLE:
+	 *   we'll have installed a pending if there wasn't one already.
+	 *
+	 * - MIGRATE_ENABLE:
+	 *   we're here because the current CPU isn't matching anymore,
+	 *   the only way that can happen is because of a concurrent
+	 *   set_cpus_allowed_ptr() call, which should then still be
+	 *   pending completion.
+	 *
+	 * Either way, we really should have a @pending here.
+	 */
+	if (WARN_ON_ONCE(!pending))
+		return -EINVAL;
+
+	arg.done = &pending->done;
+
+	if (flags & SCA_MIGRATE_ENABLE) {
+
+		task_rq_unlock(rq, p, rf);
+		pending->arg = arg;
+		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+
+		return 0;
+	}
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+	} else {
+
+		if (!is_migration_disabled(p)) {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, rf, p, dest_cpu);
+
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+do_complete:
+		if (complete)
+			complete_all(&pending->done);
+	}
+
+	wait_for_completion(&pending->done);
+
+	if (refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
+
+	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+	return 0;
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -2008,23 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
 			p->nr_cpus_allowed != 1);
 	}
 
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
+	return affine_move_task(rq, &rf, p, dest_cpu, flags);
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		return 0;
-	} else if (task_on_rq_queued(p)) {
-		/*
-		 * OK, since we're going to drop the lock immediately
-		 * afterwards anyway.
-		 */
-		rq = move_queued_task(rq, &rf, p, dest_cpu);
-	}
 out:
 	task_rq_unlock(rq, p, &rf);
 



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 11/17] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (9 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 10/17] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
                   ` (7 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

From: Thomas Gleixner <tglx@linutronix.de>

On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they return to a migratable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   36 ++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |    4 ++++
 2 files changed, 34 insertions(+), 6 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1725,10 +1725,17 @@ static void migrate_disable_switch(struc
 
 void migrate_disable(void)
 {
-	if (current->migration_disabled++)
+	struct task_struct *p = current;
+
+	if (p->migration_disabled) {
+		p->migration_disabled++;
 		return;
+	}
 
-	barrier();
+	preempt_disable();
+	this_rq()->nr_pinned++;
+	p->migration_disabled = 1;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -1755,6 +1762,7 @@ void migrate_enable(void)
 	 */
 	barrier();
 	p->migration_disabled = 0;
+	this_rq()->nr_pinned--;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1764,6 +1772,11 @@ static inline bool is_migration_disabled
 	return p->migration_disabled;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return rq->nr_pinned;
+}
+
 #endif
 
 /*
@@ -2634,6 +2647,11 @@ static inline bool is_migration_disabled
 	return false;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return false;
+}
+
 #endif
 
 static void
@@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task)) {
+	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the
 		 * last task to vanish. The rcuwait_active() check is
 		 * accurate here because the waiter is pinned on this CPU
 		 * and can't obviously be running in parallel.
+		 *
+		 * On RT kernels this also has to check whether there are
+		 * pinned and scheduled out tasks on the runqueue. They
+		 * need to leave the migrate disabled section first.
 		 */
-		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+		    rcuwait_active(&rq->hotplug_wait)) {
 			raw_spin_unlock(&rq->lock);
 			rcuwait_wake_up(&rq->hotplug_wait);
 			raw_spin_lock(&rq->lock);
@@ -7063,7 +7086,8 @@ static void balance_hotplug_wait(void)
 {
 	struct rq *rq = this_rq();
 
-	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+	rcuwait_wait_event(&rq->hotplug_wait,
+			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
 			   TASK_UNINTERRUPTIBLE);
 }
 
@@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1);
+	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	unsigned int		nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (10 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 11/17] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-06 14:09   ` Juri Lelli
  2020-10-06 15:55   ` Qais Yousef
  2020-10-05 14:57 ` [PATCH -v2 13/17] sched,rt: Use the full cpumask for balancing Peter Zijlstra
                   ` (6 subsequent siblings)
  18 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Replace a bunch of cpumask_any*() instances with
cpumask_any*_distribute(). By injecting this little bit of randomness
into the CPU selection, we reduce the chance that two competing balance
operations working off the same lowest_mask pick the same CPU.
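
A worked illustration (not from the changelog): with lowest_mask = {1,4,6}
and this CPU's distribute_cpu_mask_prev cursor at 4, cpumask_any_distribute()
returns 6; the next call wraps around to 1, then 4. Since each CPU keeps its
own cursor, two CPUs balancing concurrently are unlikely to both land on
CPU 1, which is what the plain cpumask_any()/cpumask_first() would return
every time.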

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpumask.h |    6 ++++++
 kernel/sched/cpupri.c   |    4 ++--
 kernel/sched/deadline.c |    2 +-
 kernel/sched/rt.c       |    6 +++---
 lib/cpumask.c           |   18 ++++++++++++++++++
 5 files changed, 30 insertions(+), 6 deletions(-)

--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distri
 	return cpumask_next_and(-1, src1p, src2p);
 }
 
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2001,7 +2001,7 @@ static int find_later_rq(struct task_str
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(later_mask);
+	cpu = cpumask_any_distribute(later_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_st
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(lowest_mask,
-						     sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(lowest_mask,
+							      sched_domain_span(sd));
 			if (best_cpu < nr_cpu_ids) {
 				rcu_read_unlock();
 				return best_cpu;
@@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_st
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(lowest_mask);
+	cpu = cpumask_any_distribute(lowest_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const str
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
+
+int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	int next, prev;
+
+	/* NOTE: our first selection will skip 0. */
+	prev = __this_cpu_read(distribute_cpu_mask_prev);
+
+	next = cpumask_next(prev, srcp);
+	if (next >= nr_cpu_ids)
+		next = cpumask_first(srcp);
+
+	if (next < nr_cpu_ids)
+		__this_cpu_write(distribute_cpu_mask_prev, next);
+
+	return next;
+}
+EXPORT_SYMBOL(cpumask_any_distribute);



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 13/17] sched,rt: Use the full cpumask for balancing
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (11 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 14/17] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
                   ` (5 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

We want migrate_disable() tasks to get PULLs in order for them to PUSH
away the higher priority task.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/cpudeadline.c |    4 ++--
 kernel/sched/cpupri.c      |    4 ++--
 kernel/sched/deadline.c    |    4 ++--
 kernel/sched/rt.c          |    4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
 		unsigned long cap, max_cap = 0;
 		int cpu, max_cpu = -1;
 
@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct
 
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-		if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+		if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 			if (later_mask)
 				cpumask_set_cpu(best_cpu, later_mask);
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct c
 	if (skip)
 		return 0;
 
-	if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
 		return 0;
 
 	if (lowest_mask) {
-		cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
 
 		/*
 		 * We have to ensure that we have at least one bit
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1888,7 +1888,7 @@ static void task_fork_dl(struct task_str
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 	return 0;
 }
@@ -2038,7 +2038,7 @@ static struct rq *find_lock_later_rq(str
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1658,7 +1658,7 @@ static void put_prev_task_rt(struct rq *
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 
 	return 0;
@@ -1811,7 +1811,7 @@ static struct rq *find_lock_lowest_rq(st
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 14/17] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (12 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 13/17] sched,rt: Use the full cpumask for balancing Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
                   ` (4 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

There's a valid ->pi_lock recursion issue where the actual PI code
tries to wake up the stop task. Make lockdep aware so it doesn't
complain about this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   15 +++++++++++++++
 1 file changed, 15 insertions(+)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2602,6 +2602,7 @@ int select_task_rq(struct task_struct *p
 
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
+	static struct lock_class_key stop_pi_lock;
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 
@@ -2617,6 +2618,20 @@ void sched_set_stop_task(int cpu, struct
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 
 		stop->sched_class = &stop_sched_class;
+
+		/*
+		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+		 * adjust the effective priority of a task. As a result,
+		 * rt_mutex_setprio() can trigger (RT) balancing operations,
+		 * which can then trigger wakeups of the stop thread to push
+		 * around the current task.
+		 *
+		 * The stop task itself will never be part of the PI-chain, it
+		 * never blocks, therefore that ->pi_lock recursion is safe.
+		 * Tell lockdep about this by placing the stop->pi_lock in its
+		 * own class.
+		 */
+		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
 	}
 
 	cpu_rq(cpu)->stop = stop;



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (13 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 14/17] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-06  7:59   ` Peter Zijlstra
                     ` (4 more replies)
  2020-10-05 14:57 ` [PATCH -v2 16/17] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
                   ` (3 subsequent siblings)
  18 siblings, 5 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

In order to minimize the interference of migrate_disable() on lower
priority tasks, which can be deprived of runtime due to being stuck
below a higher priority task, teach the RT/DL balancers to push away
these higher priority tasks when a lower priority task gets selected
to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but
restores bandwidth to the system that would otherwise be irrevocably lost.
Without this it would be possible to have all tasks on the system
stuck on a single CPU, each task preempted in a migrate_disable()
section with a single high priority task running.

This way we can still approximate running the M highest priority tasks
on the system.

Migrating the top task away is (of course) still subject to
migrate_disable() too, which means the lower task is subject to an
interference equivalent to the worst case migrate_disable() section.
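
A worked example (not in the original changelog): take CPUs {0,1} and RT
tasks H > M > L. M is preempted by H on CPU0 while inside a
migrate_disable() section, so it sits on CPU0's pushable list but cannot be
moved. When CPU1's priority later drops and it pulls, it finds M but, seeing
it is migration disabled, instead queues push_cpu_stop() on CPU0, which
pushes H over to CPU1 (via the new find_lock_rq() hook). M then resumes on
CPU0 and the two highest priority runnable tasks are running again.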

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   38 +++++++++++++++------------
 include/linux/sched.h   |    3 +-
 kernel/sched/core.c     |   67 ++++++++++++++++++++++++++++++++++++++++--------
 kernel/sched/deadline.c |   26 +++++++++++++-----
 kernel/sched/rt.c       |   63 ++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h    |   32 ++++++++++++++++++++++
 6 files changed, 182 insertions(+), 47 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -325,24 +325,28 @@ static inline void preempt_notifier_init
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 
 /*
- * Migrate-Disable and why it is (strongly) undesired.
+ * Migrate-Disable and why it is undesired.
  *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete, thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority task, which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing its available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
  *
  *
  * The reason we have it anyway.
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -715,8 +715,9 @@ struct task_struct {
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
-	int				migration_disabled;
+	unsigned short			migration_disabled;
 #endif
+	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1767,11 +1767,6 @@ void migrate_enable(void)
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return p->migration_disabled;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return rq->nr_pinned;
@@ -1917,6 +1912,49 @@ static int migration_cpu_stop(void *data
 	return 0;
 }
 
+int push_cpu_stop(void *arg)
+{
+	struct rq *lowest_rq = NULL, *rq = this_rq();
+	struct task_struct *p = arg;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+
+	if (task_rq(p) != rq)
+		goto out_unlock;
+
+	if (is_migration_disabled(p)) {
+		p->migration_flags |= MDF_PUSH;
+		goto out_unlock;
+	}
+
+	p->migration_flags &= ~MDF_PUSH;
+
+	if (p->sched_class->find_lock_rq)
+		lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+	if (!lowest_rq)
+		goto out_unlock;
+
+	// XXX validate p is still the highest prio task
+	if (task_rq(p) == rq) {
+		deactivate_task(rq, p, 0);
+		set_task_cpu(p, lowest_rq->cpu);
+		activate_task(lowest_rq, p, 0);
+		resched_curr(lowest_rq);
+	}
+
+	double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+	rq->push_busy = false;
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+	return 0;
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
@@ -2001,6 +2039,14 @@ static int affine_move_task(struct rq *rq, stru
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		struct task_struct *push_task = NULL;
+
+		if ((flags & SCA_MIGRATE_ENABLE) &&
+		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+			rq->push_busy = true;
+			push_task = get_task_struct(p);
+		}
+
 		pending = p->migration_pending;
 		if (pending) {
 			p->migration_pending = NULL;
@@ -2008,6 +2054,11 @@ static int affine_move_task(struct rq *rq, stru
 		}
 		task_rq_unlock(rq, p, rf);
 
+		if (push_task) {
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    p, &rq->push_work);
+		}
+
 		if (complete)
 			goto do_complete;
 
@@ -2045,6 +2096,7 @@ static int affine_move_task(struct rq *rq, stru
 
 	if (flags & SCA_MIGRATE_ENABLE) {
 
+		p->migration_flags &= ~MDF_PUSH;
 		task_rq_unlock(rq, p, rf);
 		pending->arg = arg;
 		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
@@ -2657,11 +2709,6 @@ static inline int __set_cpus_allowed_ptr
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return false;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return false;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2182,7 +2182,7 @@ static void push_dl_tasks(struct rq *rq)
 static void pull_dl_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
@@ -2212,6 +2212,7 @@ static void pull_dl_task(struct rq *this
 			continue;
 
 		/* Might drop this_rq->lock */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2243,17 +2244,27 @@ static void pull_dl_task(struct rq *this
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
-			dmin = p->dl.deadline;
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				dmin = p->dl.deadline;
+				resched = true;
+			}
 
 			/* Is there any other task even earlier? */
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2497,6 +2508,7 @@ const struct sched_class dl_sched_class
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
 	.task_woken		= task_woken_dl,
+	.find_lock_rq		= find_lock_later_rq,
 #endif
 
 	.task_tick		= task_tick_dl,
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task)) {
+		struct task_struct *push_task = NULL;
+		int cpu;
+
+		if (!pull || rq->push_busy)
+			return 0;
+
+		cpu = find_lowest_rq(rq->curr);
+		if (cpu == -1 || cpu == rq->cpu)
+			return 0;
+
+		/*
+		 * Given we found a CPU with lower priority than @next_task,
+		 * therefore it should be running. However we cannot migrate it
+		 * to this other CPU, instead attempt to push the current
+		 * running task on this CPU away.
+		 */
+		push_task = get_push_task(rq);
+		if (push_task) {
+			raw_spin_unlock(&rq->lock);
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    push_task, &rq->push_work);
+			raw_spin_lock(&rq->lock);
+		}
+
+		return 0;
+	}
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -1927,12 +1955,10 @@ static int push_rt_task(struct rq *rq)
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
-	ret = 1;
-
 	resched_curr(lowest_rq);
+	ret = 1;
 
 	double_unlock_balance(rq, lowest_rq);
-
 out:
 	put_task_struct(next_task);
 
@@ -1942,7 +1968,7 @@ static int push_rt_task(struct rq *rq)
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
-	while (push_rt_task(rq))
+	while (push_rt_task(rq, false))
 		;
 }
 
@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_w
 	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_tasks(rq);
+		while (push_rt_task(rq, true))
+			;
 		raw_spin_unlock(&rq->lock);
 	}
 
@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this
 {
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	struct rq *src_rq;
 	int rt_overload_count = rt_overloaded(this_rq);
 
@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this
 		 * double_lock_balance, and another CPU could
 		 * alter this_rq
 		 */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				resched = true;
+			}
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2446,6 +2484,7 @@ const struct sched_class rt_sched_class
 	.rq_offline             = rq_offline_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
+	.find_lock_rq		= find_lock_lowest_rq,
 #endif
 
 	.task_tick		= task_tick_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1057,6 +1057,8 @@ struct rq {
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	unsigned int		nr_pinned;
 #endif
+	unsigned int		push_busy;
+	struct cpu_stop_work	push_work;
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#define MDF_PUSH	0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	return p->migration_disabled;
+#else
+	return false;
+#endif
+}
 
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
@@ -1816,6 +1828,8 @@ struct sched_class {
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+
+	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
 #endif
 
 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1911,6 +1925,24 @@ extern void trigger_load_balance(struct
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+	struct task_struct *p = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (rq->push_busy)
+		return NULL;
+
+	if (p->nr_cpus_allowed == 1)
+		return NULL;
+
+	rq->push_busy = true;
+	return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
 #endif
 
 #ifdef CONFIG_CPU_IDLE



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 16/17] sched/proc: Print accurate cpumask vs migrate_disable()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (14 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-05 14:57 ` [PATCH -v2 17/17] sched: Add migrate_disable() tracepoints Peter Zijlstra
                   ` (2 subsequent siblings)
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Ensure /proc/*/status doesn't print 'random' cpumasks due to
migrate_disable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/proc/array.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -382,9 +382,9 @@ static inline void task_context_switch_c
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH -v2 17/17] sched: Add migrate_disable() tracepoints
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (15 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 16/17] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
@ 2020-10-05 14:57 ` Peter Zijlstra
  2020-10-13 14:01 ` [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Valentin Schneider
  2020-10-13 14:01 ` [PATCH 2/2] sched: Comment affine_move_task() Valentin Schneider
  18 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-05 14:57 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

XXX write a tracer:

 - 'migrate_disable() -> migrate_enable()' time in task_sched_runtime()
 - 'migrate_pull -> sched-in' time in task_sched_runtime()

The first will give the worst case for the second, which is the actual
interference experienced by the task due to the migration constraints of
migrate_disable().
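
A rough sketch (not part of the patch) of how such a tracer could start out:
a module probe on the new bare tracepoints, assuming they end up exported
for module use. It measures per-CPU wall-clock time, whereas the note above
asks for task_sched_runtime() based accounting, and it ignores nesting and
preemption; unregistration on module exit is omitted for brevity:

	#include <linux/module.h>
	#include <linux/sched.h>
	#include <linux/percpu.h>
	#include <linux/ktime.h>
	#include <trace/events/sched.h>

	static DEFINE_PER_CPU(u64, md_start);

	/* tracepoint probes take a leading void *data argument */
	static void probe_md_disable(void *data, struct task_struct *p)
	{
		this_cpu_write(md_start, ktime_get_ns());
	}

	static void probe_md_enable(void *data, struct task_struct *p)
	{
		u64 delta = ktime_get_ns() - this_cpu_read(md_start);

		trace_printk("%s/%d pinned for %llu ns\n", p->comm, p->pid,
			     (unsigned long long)delta);
	}

	static int __init md_tracer_init(void)
	{
		register_trace_sched_migrate_disable_tp(probe_md_disable, NULL);
		register_trace_sched_migrate_enable_tp(probe_md_enable, NULL);
		return 0;
	}
	module_init(md_tracer_init);
	MODULE_LICENSE("GPL");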

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/trace/events/sched.h |   12 ++++++++++++
 kernel/sched/core.c          |    4 ++++
 kernel/sched/deadline.c      |    1 +
 kernel/sched/rt.c            |    8 +++++++-
 4 files changed, 24 insertions(+), 1 deletion(-)

--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -646,6 +646,18 @@ DECLARE_TRACE(sched_update_nr_running_tp
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_TRACE(sched_migrate_disable_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
+DECLARE_TRACE(sched_migrate_enable_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
+DECLARE_TRACE(sched_migrate_pull_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,6 +1732,8 @@ void migrate_disable(void)
 		return;
 	}
 
+	trace_sched_migrate_disable_tp(p);
+
 	preempt_disable();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
@@ -1764,6 +1766,8 @@ void migrate_enable(void)
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
 	preempt_enable();
+
+	trace_sched_migrate_enable_tp(p);
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2245,6 +2245,7 @@ static void pull_dl_task(struct rq *this
 				goto skip;
 
 			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
 				push_task = get_push_task(src_rq);
 			} else {
 				deactivate_task(src_rq, p, 0);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1877,7 +1877,12 @@ static int push_rt_task(struct rq *rq, b
 		struct task_struct *push_task = NULL;
 		int cpu;
 
-		if (!pull || rq->push_busy)
+		if (!pull)
+			return 0;
+
+		trace_sched_migrate_pull_tp(next_task);
+
+		if (rq->push_busy)
 			return 0;
 
 		cpu = find_lowest_rq(rq->curr);
@@ -2223,6 +2228,7 @@ static void pull_rt_task(struct rq *this
 				goto skip;
 
 			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
 				push_task = get_push_task(src_rq);
 			} else {
 				deactivate_task(src_rq, p, 0);



^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 05/17] workqueue: Manually break affinity on hotplug
  2020-10-05 14:57 ` [PATCH -v2 05/17] workqueue: Manually break affinity " Peter Zijlstra
@ 2020-10-05 16:31   ` Tejun Heo
  0 siblings, 0 replies; 58+ messages in thread
From: Tejun Heo @ 2020-10-05 16:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Mon, Oct 05, 2020 at 04:57:22PM +0200, Peter Zijlstra wrote:
> Don't rely on the scheduler to force break affinity for us -- it will
> stop doing that for per-cpu-kthreads.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 03/17] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-10-05 14:57 ` [PATCH -v2 03/17] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-10-06  7:25   ` Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06  7:25 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On Mon, Oct 05, 2020 at 04:57:20PM +0200, Peter Zijlstra wrote:
> +static inline void balance_switch(struct rq *rq)
> +{
> +	if (unlikely(rq->balance_flags)) {
> +		/*
> +		 * Run the balance_callbacks, except on hotplug
> +		 * when we need to push the current task away.
> +		 */
> +		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
> +		    !(rq->balance_flags & BALANCE_PUSH) ||
> +		    !balance_push(rq))
> +			__balance_callbacks(rq);
> +	}
> +}

> @@ -1392,12 +1396,13 @@ queue_balance_callback(struct rq *rq,
>  {
>  	lockdep_assert_held(&rq->lock);
>  
> -	if (unlikely(head->next))
> +	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
>  		return;

With this bit from Valentin we can probably simplify the above function,
but I've not thought about that yet.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
@ 2020-10-06  7:59   ` Peter Zijlstra
  2020-10-06 13:44     ` Steven Rostedt
  2020-10-06 11:20   ` Valentin Schneider
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06  7:59 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On Mon, Oct 05, 2020 at 04:57:32PM +0200, Peter Zijlstra wrote:
> +static inline struct task_struct *get_push_task(struct rq *rq)
> +{
> +	struct task_struct *p = rq->curr;
> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (rq->push_busy)
> +		return NULL;
> +
> +	if (p->nr_cpus_allowed == 1)
> +		return NULL;

This; that means that when we're stuck below a per-cpu thread, we're
toast. There's just nothing much you can do... :/

> +
> +	rq->push_busy = true;
> +	return get_task_struct(p);
> +}

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 08/17] sched: Massage set_cpus_allowed()
  2020-10-05 14:57 ` [PATCH -v2 08/17] sched: Massage set_cpus_allowed() Peter Zijlstra
@ 2020-10-06 11:20   ` Valentin Schneider
  0 siblings, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-06 11:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 05/10/20 15:57, Peter Zijlstra wrote:
> Thread a u32 flags word through the *set_cpus_allowed*() callchain.
> This will allow adding behavioural tweaks for future users.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
[...]
> @@ -1899,7 +1900,9 @@ extern void update_group_capacity(struct
>
>  extern void trigger_load_balance(struct rq *rq);
>
> -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
> +#define SCA_CHECK		0x01
> +

That most likely needs to live outside of the #ifdef CONFIG_SMP block, so this
should be hoisted up - you already do that in the next patch, but it should
be done here.

> +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
>
>  #endif
>

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 09/17] sched: Add migrate_disable()
  2020-10-05 14:57 ` [PATCH -v2 09/17] sched: Add migrate_disable() Peter Zijlstra
@ 2020-10-06 11:20   ` Valentin Schneider
  0 siblings, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-06 11:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 05/10/20 15:57, Peter Zijlstra wrote:
> Add the base migrate_disable() support (under protest).
>
> While migrate_disable() is (currently) required for PREEMPT_RT, it is
> also one of the biggest flaws in the system.
>
> Notably this is just the base implementation, it is broken vs
> sched_setaffinity() and hotplug, both solved in additional patches for
> ease of review.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
[...]
> @@ -2363,7 +2451,7 @@ int select_task_rq(struct task_struct *p
>  {
>       lockdep_assert_held(&p->pi_lock);
>
> -	if (p->nr_cpus_allowed > 1)
> +	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))

So dissociating migrate_disable() from p->nr_cpus_allowed unlocks the whole
DL+RT push/pull thingie, but it also means we need to be somewhat careful
regarding checks like the above.

Looking at current p->nr_cpus_allowed readers, it's mostly DL/RT; I was
somewhat afraid CFS load balance would use it but turns out it doesn't. The
only borderline case I see is call_on_cpu(), and even then it would be just a
performance deal rather than a correctness one.

All that to say: it seems fine to me.

>               cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
>       else
>               cpu = cpumask_any(p->cpus_ptr);

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
  2020-10-06  7:59   ` Peter Zijlstra
@ 2020-10-06 11:20   ` Valentin Schneider
  2020-10-06 13:48     ` Peter Zijlstra
  2020-10-07 19:22   ` Valentin Schneider
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 58+ messages in thread
From: Valentin Schneider @ 2020-10-06 11:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 05/10/20 15:57, Peter Zijlstra wrote:
> In order to minimize the interference of migrate_disable() on lower
> priority tasks, which can be deprived of runtime due to being stuck
> below a higher priority task. Teach the RT/DL balancers to push away
> these higher priority tasks when a lower priority task gets selected
> to run on a freshly demoted CPU (pull).
>
> This adds migration interference to the higher priority task, but
> restores bandwidth to system that would otherwise be irrevocably lost.
> Without this it would be possible to have all tasks on the system
> stuck on a single CPU, each task preempted in a migrate_disable()
> section with a single high priority task running.
>
> This way we can still approximate running the M highest priority tasks
> on the system.
>

Ah, so IIUC that's the important bit that means we can't just go
through the pushable_tasks list and skip migrate_disable() tasks.

Once the highest-prio task exits its migrate_disable() region, your patch
pushes it away. If we ended up with a single busy CPU, it'll spread the
tasks around one migrate_enable() at a time.

That time where the top task is migrate_disable() is still a crappy time,
and as you pointed out earlier today if it is a genuine pcpu task then the
whole thing is -EBORKED...

An alternative I could see would be to prevent those piles from forming
altogether, say by issuing a similar push_cpu_stop() on migrate_disable()
if the next pushable task is already migrate_disable(); but that's a
proactive approach whereas yours is reactive, so I'm pretty sure that's
bound to perform worse.

> Migrating the top task away is (ofcourse) still subject to
> migrate_disable() too, which means the lower task is subject to an
> interference equivalent to the worst case migrate_disable() section.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-05 14:57 ` [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
@ 2020-10-06 12:46   ` Vincent Guittot
  2020-10-06 13:33     ` Peter Zijlstra
  2020-10-09 20:41   ` Dietmar Eggemann
  1 sibling, 1 reply; 58+ messages in thread
From: Vincent Guittot @ 2020-10-06 12:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel,
	Sebastian Andrzej Siewior, Qais Yousef, Scott Wood,
	Valentin Schneider, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
	Vincent Donnefort, Tejun Heo

On Mon, 5 Oct 2020 at 17:09, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Since we now migrate tasks away before DYING, we should also move
> bandwidth unthrottle, otherwise we can gain tasks from unthrottle
> after we expect all tasks to be gone already.
>
> Also; it looks like the RT balancers don't respect cpu_active() and
> instead rely on rq->online in part, complete this. This too requires
> we do set_rq_offline() earlier to match the cpu_active() semantics.
> (The bigger patch is to convert RT to cpu_active() entirely)
>
> Since set_rq_online() is called from sched_cpu_activate(), place
> set_rq_offline() in sched_cpu_deactivate().
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c     |   14 ++++++++++----
>  kernel/sched/deadline.c |    5 +----
>  kernel/sched/rt.c       |    5 +----
>  3 files changed, 12 insertions(+), 12 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6979,6 +6979,8 @@ int sched_cpu_activate(unsigned int cpu)
>
>  int sched_cpu_deactivate(unsigned int cpu)
>  {
> +       struct rq *rq = cpu_rq(cpu);
> +       struct rq_flags rf;
>         int ret;
>
>         set_cpu_active(cpu, false);
> @@ -6993,6 +6995,14 @@ int sched_cpu_deactivate(unsigned int cp
>
>         balance_push_set(cpu, true);
>
> +       rq_lock_irqsave(rq, &rf);
> +       if (rq->rd) {
> +               update_rq_clock();

Tried to compile but rq parameter is missing

> +               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
> +               set_rq_offline(rq);
> +       }
> +       rq_unlock_irqrestore(rq, &rf);
> +
>  #ifdef CONFIG_SCHED_SMT
>         /*
>          * When going down, decrement the number of cores with SMT present.
> @@ -7074,10 +7084,6 @@ int sched_cpu_dying(unsigned int cpu)
>         sched_tick_stop(cpu);
>
>         rq_lock_irqsave(rq, &rf);
> -       if (rq->rd) {
> -               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
> -               set_rq_offline(rq);
> -       }
>         BUG_ON(rq->nr_running != 1);
>         rq_unlock_irqrestore(rq, &rf);
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq);
>
>  static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
>  {
> -       return dl_task(prev);
> +       return rq->online && dl_task(prev);
>  }
>
>  static DEFINE_PER_CPU(struct callback_head, dl_push_head);
> @@ -2326,9 +2326,6 @@ static void rq_online_dl(struct rq *rq)
>  /* Assumes rq->lock is held */
>  static void rq_offline_dl(struct rq *rq)
>  {
> -       if (rq->dl.overloaded)
> -               dl_clear_overload(rq);
> -
>         cpudl_clear(&rq->rd->cpudl, rq->cpu);
>         cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
>  }
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this
>  static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
>  {
>         /* Try to pull RT tasks here if we lower this rq's prio */
> -       return rq->rt.highest_prio.curr > prev->prio;
> +       return rq->online && rq->rt.highest_prio.curr > prev->prio;
>  }
>
>  static inline int rt_overloaded(struct rq *rq)
> @@ -2245,9 +2245,6 @@ static void rq_online_rt(struct rq *rq)
>  /* Assumes rq->lock is held */
>  static void rq_offline_rt(struct rq *rq)
>  {
> -       if (rq->rt.overloaded)
> -               rt_clear_overload(rq);
> -
>         __disable_runtime(rq);
>
>         cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
>
>

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-06 12:46   ` Vincent Guittot
@ 2020-10-06 13:33     ` Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06 13:33 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel,
	Sebastian Andrzej Siewior, Qais Yousef, Scott Wood,
	Valentin Schneider, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
	Vincent Donnefort, Tejun Heo

On Tue, Oct 06, 2020 at 02:46:28PM +0200, Vincent Guittot wrote:

> > @@ -6993,6 +6995,14 @@ int sched_cpu_deactivate(unsigned int cp
> >
> >         balance_push_set(cpu, true);
> >
> > +       rq_lock_irqsave(rq, &rf);
> > +       if (rq->rd) {
> > +               update_rq_clock();
> 
> Tried to compile but rq parameter is missing

Damn :/, I'm sure I fixed that. Must've lost a refresh before sending.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06  7:59   ` Peter Zijlstra
@ 2020-10-06 13:44     ` Steven Rostedt
  2020-10-06 13:50       ` Peter Zijlstra
  0 siblings, 1 reply; 58+ messages in thread
From: Steven Rostedt @ 2020-10-06 13:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vincent.donnefort,
	tj

On Tue, 6 Oct 2020 09:59:39 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Oct 05, 2020 at 04:57:32PM +0200, Peter Zijlstra wrote:
> > +static inline struct task_struct *get_push_task(struct rq *rq)
> > +{
> > +	struct task_struct *p = rq->curr;
> > +
> > +	lockdep_assert_held(&rq->lock);
> > +
> > +	if (rq->push_busy)
> > +		return NULL;
> > +
> > +	if (p->nr_cpus_allowed == 1)
> > +		return NULL;  
> 
> This; that means what when we're stuck below a per-cpu thread, we're
> toast. There's just nothing much you can do... :/

Well, hopefully, per CPU threads don't run for long periods of time. I'm
working with folks having issues with non-stop RT threads that every so
often go into the kernel and kick off per CPU kernel threads, which then get
starved when the RT tasks go back to user space, causing the rest of the
system to hang.

As I've always said: when dealing with real-time systems, you need to be
careful about how you organize your tasks. Ideally, any RT task that is
pinned to a CPU shouldn't be sharing that CPU with anything else that may
be critical.

-- Steve


> 
> > +
> > +	rq->push_busy = true;
> > +	return get_task_struct(p);
> > +}  


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06 11:20   ` Valentin Schneider
@ 2020-10-06 13:48     ` Peter Zijlstra
  2020-10-06 14:37       ` Juri Lelli
  2020-10-06 16:19       ` Valentin Schneider
  0 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06 13:48 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On Tue, Oct 06, 2020 at 12:20:43PM +0100, Valentin Schneider wrote:
> 
> On 05/10/20 15:57, Peter Zijlstra wrote:
> > In order to minimize the interference of migrate_disable() on lower
> > priority tasks, which can be deprived of runtime due to being stuck
> > below a higher priority task, teach the RT/DL balancers to push away
> > these higher priority tasks when a lower priority task gets selected
> > to run on a freshly demoted CPU (pull).
> >
> > This adds migration interference to the higher priority task, but
> > restores bandwidth to the system that would otherwise be irrevocably lost.
> > Without this it would be possible to have all tasks on the system
> > stuck on a single CPU, each task preempted in a migrate_disable()
> > section with a single high priority task running.
> >
> > This way we can still approximate running the M highest priority tasks
> > on the system.
> >
> 
> Ah, so IIUC that's the important bit that makes it so we can't just go
> through the pushable_tasks list and skip migrate_disable() tasks.
> 
> Once the highest-prio task exits its migrate_disable() region, your patch
> pushes it away. If we ended up with a single busy CPU, it'll spread the
> tasks around one migrate_enable() at a time.
> 
> That time where the top task is migrate_disable() is still a crappy time,
> and as you pointed out earlier today if it is a genuine pcpu task then the
> whole thing is -EBORKED...
> 
> An alternative I could see would be to prevent those piles from forming
> altogether, say by issuing a similar push_cpu_stop() on migrate_disable()
> if the next pushable task is already migrate_disable(); but that's a
> proactive approach whereas yours is reactive, so I'm pretty sure that's
> bound to perform worse.

I think it is always possible to form pileups. Just start enough tasks
such that newer, higher priority, tasks have to preempt existing tasks.

Also, we might not be able to place the task elsewhere, suppose we have
all our M CPUs filled with an RT task, then when the lowest priority
task has migrate_disable(), wake the highest priority task.

Per the SMP invariant, this new highest priority task must preempt the
lowest priority task currently running, otherwise we would not be
running the M highest prio tasks.

That's not to say it might not still be beneficial to try to avoid
them, but we must assume a pileup will occur; therefore my focus was on
dealing with them as best we can first.
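
As a toy illustration of that invariant (nothing kernel specific,
priorities made up): a wakeup of a task with higher priority than the
lowest currently-running priority has to land on that lowest-priority
CPU, regardless of whether the task running there happens to be sitting
in a migrate_disable() section.

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	/* priority of the task currently running on each CPU (higher wins) */
	int curr_prio[NR_CPUS] = { 50, 30, 40, 20 };
	int new_prio = 60;	/* freshly woken task */
	int cpu, lowest = 0;

	for (cpu = 1; cpu < NR_CPUS; cpu++)
		if (curr_prio[cpu] < curr_prio[lowest])
			lowest = cpu;

	if (new_prio > curr_prio[lowest])
		printf("wakeup preempts CPU%d (prio %d); if that task is in a\n"
		       "migrate_disable() section it is now stuck below us\n",
		       lowest, curr_prio[lowest]);
	return 0;
}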

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06 13:44     ` Steven Rostedt
@ 2020-10-06 13:50       ` Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06 13:50 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vincent.donnefort,
	tj

On Tue, Oct 06, 2020 at 09:44:45AM -0400, Steven Rostedt wrote:
> As I've always said. When dealing with real-time systems, you need to be
> careful about how you organize your tasks. Ideally, any RT task that is
> pinned to a CPU shouldn't be sharing that CPU with anything else that may
> be critical.

Due to locks we're always sharing.. :-)

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute()
  2020-10-05 14:57 ` [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
@ 2020-10-06 14:09   ` Juri Lelli
  2020-10-06 14:20     ` Peter Zijlstra
  2020-10-06 15:55   ` Qais Yousef
  1 sibling, 1 reply; 58+ messages in thread
From: Juri Lelli @ 2020-10-06 14:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

Hi,

On 05/10/20 16:57, Peter Zijlstra wrote:
> Replace a bunch of cpumask_any*() instances with
> cpumask_any*_distribute(); by injecting this little bit of randomness
> into CPU selection, we reduce the chance that two competing balance
> operations working off the same lowest_mask pick the same CPU.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/cpumask.h |    6 ++++++
>  kernel/sched/cpupri.c   |    4 ++--
>  kernel/sched/deadline.c |    2 +-
>  kernel/sched/rt.c       |    6 +++---
>  lib/cpumask.c           |   18 ++++++++++++++++++
>  5 files changed, 30 insertions(+), 6 deletions(-)
> 
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -199,6 +199,11 @@ static inline int cpumask_any_and_distri
>  	return cpumask_next_and(-1, src1p, src2p);
>  }
>  
> +static inline int cpumask_any_distribute(const struct cpumask *srcp)
> +{
> +	return cpumask_first(srcp);
> +}
> +
>  #define for_each_cpu(cpu, mask)			\
>  	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
>  #define for_each_cpu_not(cpu, mask)		\
> @@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask
>  unsigned int cpumask_local_spread(unsigned int i, int node);
>  int cpumask_any_and_distribute(const struct cpumask *src1p,
>  			       const struct cpumask *src2p);
> +int cpumask_any_distribute(const struct cpumask *srcp);
>  
>  /**
>   * for_each_cpu - iterate over every cpu in a mask
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2001,7 +2001,7 @@ static int find_later_rq(struct task_str
>  	if (this_cpu != -1)
>  		return this_cpu;
>  
> -	cpu = cpumask_any(later_mask);
> +	cpu = cpumask_any_distribute(later_mask);
>  	if (cpu < nr_cpu_ids)
>  		return cpu;

Think we can use cpumask_any_and_distribute() with later_mask for
deadline as well inside the for_each_domain loop as you do for rt below.

Best,
Juri

> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_st
>  				return this_cpu;
>  			}
>  
> -			best_cpu = cpumask_first_and(lowest_mask,
> -						     sched_domain_span(sd));
> +			best_cpu = cpumask_any_and_distribute(lowest_mask,
> +							      sched_domain_span(sd));
>  			if (best_cpu < nr_cpu_ids) {
>  				rcu_read_unlock();
>  				return best_cpu;
> @@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_st
>  	if (this_cpu != -1)
>  		return this_cpu;
>  
> -	cpu = cpumask_any(lowest_mask);
> +	cpu = cpumask_any_distribute(lowest_mask);
>  	if (cpu < nr_cpu_ids)
>  		return cpu;


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute()
  2020-10-06 14:09   ` Juri Lelli
@ 2020-10-06 14:20     ` Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06 14:20 UTC (permalink / raw)
  To: Juri Lelli
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On Tue, Oct 06, 2020 at 04:09:26PM +0200, Juri Lelli wrote:

> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -2001,7 +2001,7 @@ static int find_later_rq(struct task_str
> >  	if (this_cpu != -1)
> >  		return this_cpu;
> >  
> > -	cpu = cpumask_any(later_mask);
> > +	cpu = cpumask_any_distribute(later_mask);
> >  	if (cpu < nr_cpu_ids)
> >  		return cpu;
> 
> Think we can use cpumask_any_and_distribute() with later_mask for
> deadline as well inside the for_each_domain loop as you do for rt below.

Ah, indeed.. I missed it because it is cpumask_first, instead of
cpumask_any as with rt.

I folded the below.

---
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1981,8 +1981,8 @@ static int find_later_rq(struct task_str
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(later_mask,
-							sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(later_mask,
+							      sched_domain_span(sd));
 			/*
 			 * Last chance: if a CPU being in both later_mask
 			 * and current sd span is valid, that becomes our

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06 13:48     ` Peter Zijlstra
@ 2020-10-06 14:37       ` Juri Lelli
  2020-10-06 14:48         ` Peter Zijlstra
  2020-10-06 16:19       ` Valentin Schneider
  1 sibling, 1 reply; 58+ messages in thread
From: Juri Lelli @ 2020-10-06 14:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Valentin Schneider, tglx, mingo, linux-kernel, bigeasy,
	qais.yousef, swood, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On 06/10/20 15:48, Peter Zijlstra wrote:
> On Tue, Oct 06, 2020 at 12:20:43PM +0100, Valentin Schneider wrote:
> > 
> > On 05/10/20 15:57, Peter Zijlstra wrote:
> > > In order to minimize the interference of migrate_disable() on lower
> > > priority tasks, which can be deprived of runtime due to being stuck
> > > below a higher priority task, teach the RT/DL balancers to push away
> > > these higher priority tasks when a lower priority task gets selected
> > > to run on a freshly demoted CPU (pull).

Still digesting the whole lot, but can't we "simply" force push the
highest prio (that we preempt to make space for the migrate_disabled
lower prio) directly to the cpu that would accept the lower prio that
cannot move?

Asking because AFAIU we are calling find_lock_rq from push_cpu_stop and
that selects the best cpu for the high prio. I'm basically wondering if
we could avoid moving, potentially multiple, high prio tasks around to
make space for a lower prio task.

> > > This adds migration interference to the higher priority task, but
> > > restores bandwidth to the system that would otherwise be irrevocably lost.
> > > Without this it would be possible to have all tasks on the system
> > > stuck on a single CPU, each task preempted in a migrate_disable()
> > > section with a single high priority task running.
> > >
> > > This way we can still approximate running the M highest priority tasks
> > > on the system.
> > >
> > 
> > Ah, so IIUC that's the important bit that makes it so we can't just go
> > through the pushable_tasks list and skip migrate_disable() tasks.
> > 
> > Once the highest-prio task exits its migrate_disable() region, your patch
> > pushes it away. If we ended up with a single busy CPU, it'll spread the
> > tasks around one migrate_enable() at a time.
> > 
> > That time where the top task is migrate_disable() is still a crappy time,
> > and as you pointed out earlier today if it is a genuine pcpu task then the
> > whole thing is -EBORKED...
> > 
> > An alternative I could see would be to prevent those piles from forming
> > altogether, say by issuing a similar push_cpu_stop() on migrate_disable()
> > if the next pushable task is already migrate_disable(); but that's a
> > proactive approach whereas yours is reactive, so I'm pretty sure that's
> > bound to perform worse.
> 
> I think it is always possible to form pileups. Just start enough tasks
> such that newer, higher priority, tasks have to preempt existing tasks.
> 
> Also, we might not be able to place the task elsewhere, suppose we have
> all our M CPUs filled with an RT task, then when the lowest priority
> task has migrate_disable(), wake the highest priority task.
> 
> Per the SMP invariant, this new highest priority task must preempt the
> lowest priority task currently running, otherwise we would not be
> running the M highest prio tasks.
> 
> That's not to say it might not still be beneficial to try to avoid
> them, but we must assume a pileup will occur; therefore my focus was on
> dealing with them as best we can first.
> 


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06 14:37       ` Juri Lelli
@ 2020-10-06 14:48         ` Peter Zijlstra
  2020-10-07  5:29           ` Juri Lelli
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-06 14:48 UTC (permalink / raw)
  To: Juri Lelli
  Cc: Valentin Schneider, tglx, mingo, linux-kernel, bigeasy,
	qais.yousef, swood, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On Tue, Oct 06, 2020 at 04:37:04PM +0200, Juri Lelli wrote:
> On 06/10/20 15:48, Peter Zijlstra wrote:
> > On Tue, Oct 06, 2020 at 12:20:43PM +0100, Valentin Schneider wrote:
> > > 
> > > On 05/10/20 15:57, Peter Zijlstra wrote:
> > > > In order to minimize the interference of migrate_disable() on lower
> > > > priority tasks, which can be deprived of runtime due to being stuck
> > > > below a higher priority task, teach the RT/DL balancers to push away
> > > > these higher priority tasks when a lower priority task gets selected
> > > > to run on a freshly demoted CPU (pull).
> 
> Still digesting the whole lot, but can't we "simply" force push the
> highest prio (that we preempt to make space for the migrate_disabled
> lower prio) directly to the cpu that would accept the lower prio that
> cannot move?
> 
> Asking because AFAIU we are calling find_lock_rq from push_cpu_stop and
> that selects the best cpu for the high prio. I'm basically wondering if
> we could avoid moving, potentially multiple, high prio tasks around to
> make space for a lower prio task.

The intention was to do as you describe in the first paragraph, and
isn't pull also using find_lock_rq() to select the 'lowest' priority
runqueue to move the task to?

That is, both actions should end up at the same 'lowest' prio CPU.
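
For reference, both ends funnel through the same selection helper; a
trimmed sketch of the RT flavour (simplified, not the verbatim code)
looks roughly like the below -- find_lowest_rq() is the shared choice
point, and the pick is revalidated after double_lock_balance() may have
dropped rq->lock, which is why push and pull converge on the same
'lowest' prio CPU.

static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries, cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);	/* shared choice point */
		if (cpu == -1 || cpu == rq->cpu)
			break;

		lowest_rq = cpu_rq(cpu);
		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/* target is no longer of lower prio; give up */
			lowest_rq = NULL;
			break;
		}

		if (double_lock_balance(rq, lowest_rq)) {
			/* rq->lock was dropped; revalidate the task */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       &task->cpus_mask) ||
				     task_running(rq, task) ||
				     !task_on_rq_queued(task))) {
				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* still the lowest prio rq? use it */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* something changed; try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}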

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute()
  2020-10-05 14:57 ` [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
  2020-10-06 14:09   ` Juri Lelli
@ 2020-10-06 15:55   ` Qais Yousef
  2020-10-07  7:22     ` Peter Zijlstra
  1 sibling, 1 reply; 58+ messages in thread
From: Qais Yousef @ 2020-10-06 15:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On 10/05/20 16:57, Peter Zijlstra wrote:
> Replace a bunch of cpumask_any*() instances with
> cpumask_any*_distribute(); by injecting this little bit of randomness
> into CPU selection, we reduce the chance that two competing balance
> operations working off the same lowest_mask pick the same CPU.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/cpumask.h |    6 ++++++
>  kernel/sched/cpupri.c   |    4 ++--
>  kernel/sched/deadline.c |    2 +-
>  kernel/sched/rt.c       |    6 +++---
>  lib/cpumask.c           |   18 ++++++++++++++++++
>  5 files changed, 30 insertions(+), 6 deletions(-)
> 

[...]

> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_st
>  				return this_cpu;
>  			}
>  
> -			best_cpu = cpumask_first_and(lowest_mask,
> -						     sched_domain_span(sd));
> +			best_cpu = cpumask_any_and_distribute(lowest_mask,
> +							      sched_domain_span(sd));

I guess I should have done this 6 months ago and just got done with it :)

	20200414150556.10920-1-qais.yousef@arm.com

>  			if (best_cpu < nr_cpu_ids) {
>  				rcu_read_unlock();
>  				return best_cpu;
> @@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_st
>  	if (this_cpu != -1)
>  		return this_cpu;
>  
> -	cpu = cpumask_any(lowest_mask);
> +	cpu = cpumask_any_distribute(lowest_mask);
>  	if (cpu < nr_cpu_ids)
>  		return cpu;
>  
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const str
>  	return next;
>  }
>  EXPORT_SYMBOL(cpumask_any_and_distribute);
> +
> +int cpumask_any_distribute(const struct cpumask *srcp)
> +{
> +	int next, prev;
> +
> +	/* NOTE: our first selection will skip 0. */
> +	prev = __this_cpu_read(distribute_cpu_mask_prev);

We had a discussion back then that the __this_cpu*() variants assume
preemption is disabled and that it's safer to use the this_cpu*()
variants instead. Does that still hold true here?
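
To recap the distinction I mean, as a minimal usage sketch (illustration
only, not the code in this patch):

	int prev;

	/*
	 * __this_cpu_*() assumes the caller is already non-migratable;
	 * CONFIG_DEBUG_PREEMPT warns if it is used preemptibly:
	 */
	preempt_disable();
	prev = __this_cpu_read(distribute_cpu_mask_prev);
	preempt_enable();

	/*
	 * this_cpu_*() may be called with preemption enabled; the access
	 * itself is done in a migration-safe way:
	 */
	prev = this_cpu_read(distribute_cpu_mask_prev);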

Thanks

--
Qais Yousef

> +
> +	next = cpumask_next(prev, srcp);
> +	if (next >= nr_cpu_ids)
> +		next = cpumask_first(srcp);
> +
> +	if (next < nr_cpu_ids)
> +		__this_cpu_write(distribute_cpu_mask_prev, next);
> +
> +	return next;
> +}
> +EXPORT_SYMBOL(cpumask_any_distribute);
> 
> 

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06 13:48     ` Peter Zijlstra
  2020-10-06 14:37       ` Juri Lelli
@ 2020-10-06 16:19       ` Valentin Schneider
  1 sibling, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-06 16:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 06/10/20 14:48, Peter Zijlstra wrote:
> On Tue, Oct 06, 2020 at 12:20:43PM +0100, Valentin Schneider wrote:
>>
>> An alternative I could see would be to prevent those piles from forming
>> altogether, say by issuing a similar push_cpu_stop() on migrate_disable()
>> if the next pushable task is already migrate_disable(); but that's a
>> proactive approach whereas yours is reactive, so I'm pretty sure that's
>> bound to perform worse.
>
> I think it is always possible to form pileups. Just start enough tasks
> such that newer, higher priority, tasks have to preempt existing tasks.
>
> Also, we might not be able to place the task elsewhere, suppose we have
> all our M CPUs filled with an RT task, then when the lowest priority
> task has migrate_disable(), wake the highest priority task.
>
> Per the SMP invariant, this new highest priority task must preempt the
> lowest priority task currently running, otherwise we would not be
> running the M highest prio tasks.
>

Right, and it goes the other way around for the migrate_disable() task: if
it becomes one of the M highest prio tasks, then it *must* run, and
push/pulling its CPU's current away is the only way to do so...

> That's not to say it might not still be beneficial to try to avoid
> them, but we must assume a pileup will occur; therefore my focus was on
> dealing with them as best we can first.

"Funny" all that... Thanks!

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-06 14:48         ` Peter Zijlstra
@ 2020-10-07  5:29           ` Juri Lelli
  0 siblings, 0 replies; 58+ messages in thread
From: Juri Lelli @ 2020-10-07  5:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Valentin Schneider, tglx, mingo, linux-kernel, bigeasy,
	qais.yousef, swood, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On 06/10/20 16:48, Peter Zijlstra wrote:
> On Tue, Oct 06, 2020 at 04:37:04PM +0200, Juri Lelli wrote:
> > On 06/10/20 15:48, Peter Zijlstra wrote:
> > > On Tue, Oct 06, 2020 at 12:20:43PM +0100, Valentin Schneider wrote:
> > > > 
> > > > On 05/10/20 15:57, Peter Zijlstra wrote:
> > > > > In order to minimize the interference of migrate_disable() on lower
> > > > > priority tasks, which can be deprived of runtime due to being stuck
> > > > > below a higher priority task, teach the RT/DL balancers to push away
> > > > > these higher priority tasks when a lower priority task gets selected
> > > > > to run on a freshly demoted CPU (pull).
> > 
> > Still digesting the whole lot, but can't we "simply" force push the
> > highest prio (that we preempt to make space for the migrate_disabled
> > lower prio) directly to the cpu that would accept the lower prio that
> > cannot move?
> > 
> > Asking because AFAIU we are calling find_lock_rq from push_cpu_stop and
> > that selects the best cpu for the high prio. I'm basically wondering if
> > we could avoid moving, potentially multiple, high prio tasks around to
> > make space for a lower prio task.
> 
> The intention was to do as you describe in the first paragraph, and
> isn't pull also using find_lock_rq() to select the 'lowest' priority
> runqueue to move the task to?
> 
> That is, both actions should end up at the same 'lowest' prio CPU.
> 

OK, right. I think there might still be differences, since successive
calls to find_lock_rq() could select different candidates (after the
introduction of cpumask_..._distribute), but that shouldn't really
matter much (also, things might have changed in the meantime and we
really have to call find_lock_rq() again, I guess).


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute()
  2020-10-06 15:55   ` Qais Yousef
@ 2020-10-07  7:22     ` Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-07  7:22 UTC (permalink / raw)
  To: Qais Yousef
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On Tue, Oct 06, 2020 at 04:55:27PM +0100, Qais Yousef wrote:
> > +int cpumask_any_distribute(const struct cpumask *srcp)
> > +{
> > +	int next, prev;
> > +
> > +	/* NOTE: our first selection will skip 0. */
> > +	prev = __this_cpu_read(distribute_cpu_mask_prev);
> 
> We had a discussion back then that the __this_cpu*() variants assume
> preemption is disabled and that it's safer to use the this_cpu*()
> variants instead. Does that still hold true here?

I think we ended up not caring. We wanted a 'random' value; we get
a 'random' value from a 'random' CPU, so it still works ;-)
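
To illustrate why a stale cursor is harmless, here's a tiny
userspace-style sketch of the rotation idea (illustration only, not the
kernel implementation; the real cursor is the per-CPU
distribute_cpu_mask_prev):

#include <stdio.h>

/* stand-in for the per-CPU cursor; a stale value only changes where the
 * next search starts, it never affects whether a set bit is found */
static int prev = -1;

static int any_distribute(unsigned long mask, int nbits)
{
	int next;

	for (next = prev + 1; next < nbits; next++)
		if (mask & (1UL << next))
			goto found;

	for (next = 0; next < nbits; next++)
		if (mask & (1UL << next))
			goto found;

	return nbits;				/* empty mask */
found:
	prev = next;
	return next;
}

int main(void)
{
	unsigned long mask = (1UL << 1) | (1UL << 2) | (1UL << 4);
	int i;

	for (i = 0; i < 6; i++)			/* prints: 1 2 4 1 2 4 */
		printf("%d ", any_distribute(mask, 8));
	printf("\n");
	return 0;
}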

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
  2020-10-06  7:59   ` Peter Zijlstra
  2020-10-06 11:20   ` Valentin Schneider
@ 2020-10-07 19:22   ` Valentin Schneider
  2020-10-07 21:08     ` Peter Zijlstra
  2020-10-08 10:48   ` Sebastian Andrzej Siewior
  2020-10-12  9:56   ` Dietmar Eggemann
  4 siblings, 1 reply; 58+ messages in thread
From: Valentin Schneider @ 2020-10-07 19:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 05/10/20 15:57, Peter Zijlstra wrote:
> In order to minimize the interference of migrate_disable() on lower
> priority tasks, which can be deprived of runtime due to being stuck
> below a higher priority task, teach the RT/DL balancers to push away
> these higher priority tasks when a lower priority task gets selected
> to run on a freshly demoted CPU (pull).
>
> This adds migration interference to the higher priority task, but
> restores bandwidth to the system that would otherwise be irrevocably lost.
> Without this it would be possible to have all tasks on the system
> stuck on a single CPU, each task preempted in a migrate_disable()
> section with a single high priority task running.
>
> This way we can still approximate running the M highest priority tasks
> on the system.
>
> Migrating the top task away is (of course) still subject to
> migrate_disable() too, which means the lower task is subject to an
> interference equivalent to the worst case migrate_disable() section.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[...]
> @@ -1917,6 +1912,49 @@ static int migration_cpu_stop(void *data
>       return 0;
>  }
>
> +int push_cpu_stop(void *arg)
> +{
> +	struct rq *lowest_rq = NULL, *rq = this_rq();
> +	struct task_struct *p = arg;
> +
> +	raw_spin_lock_irq(&p->pi_lock);
> +	raw_spin_lock(&rq->lock);
> +
> +	if (task_rq(p) != rq)
> +		goto out_unlock;
> +
> +	if (is_migration_disabled(p)) {
> +		p->migration_flags |= MDF_PUSH;
> +		goto out_unlock;
> +	}
> +
> +	p->migration_flags &= ~MDF_PUSH;
> +
> +	if (p->sched_class->find_lock_rq)
> +		lowest_rq = p->sched_class->find_lock_rq(p, rq);
> +
> +	if (!lowest_rq)
> +		goto out_unlock;
> +
> +	// XXX validate p is still the highest prio task

So we want to move some !migrate_disable(), highest priority task to make
room for a migrate_disable(), lower priority task. We're working with the
task that was highest prio at the time of kicking the CPU stopper
(i.e. back when we invoked get_push_task()), but as you point out here it
might no longer be of highest prio.

I was thinking that since this is done in stopper context we could peek at
what actually is the current (!stopper) highest prio task, but that implies
first grabbing the rq lock and *then* some p->pi_lock, which is a no-no :(

Regarding the check, I've cobbled the below. I'm not fond of the implicit
expectation that p will always be > CFS, but there's no CFS .find_lock_rq()
(... for now).

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 69b1173eaf45..3ed339980f87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1919,6 +1919,7 @@ static int migration_cpu_stop(void *data)
 int push_cpu_stop(void *arg)
 {
        struct rq *lowest_rq = NULL, *rq = this_rq();
+	const struct sched_class *class;
        struct task_struct *p = arg;

        raw_spin_lock_irq(&p->pi_lock);
@@ -1940,14 +1941,23 @@ int push_cpu_stop(void *arg)
        if (!lowest_rq)
                goto out_unlock;

-	// XXX validate p is still the highest prio task
-	if (task_rq(p) == rq) {
-		deactivate_task(rq, p, 0);
-		set_task_cpu(p, lowest_rq->cpu);
-		activate_task(lowest_rq, p, 0);
-		resched_curr(lowest_rq);
+	// Validate p is still the highest prio task
+	for_class_range(class, &stop_sched_class - 1, p->sched_class - 1) {
+		struct task_struct *curr = class->peek_next_task(rq);
+
+		if (!curr)
+			continue;
+		if (curr != p)
+			goto out_unlock;
+		else
+			break;
        }

+	deactivate_task(rq, p, 0);
+	set_task_cpu(p, lowest_rq->cpu);
+	activate_task(lowest_rq, p, 0);
+	resched_curr(lowest_rq);
+
        double_unlock_balance(rq, lowest_rq);

 out_unlock:
@@ -7312,7 +7322,7 @@ int sched_cpu_deactivate(unsigned int cpu)

        rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
-		update_rq_clock();
+		update_rq_clock(rq);
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
        }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 15320ede2f45..7964c42b8604 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1840,6 +1840,19 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
        return p;
 }

+static struct task_struct *peek_next_task_dl(struct rq *rq)
+{
+	struct sched_dl_entity *dl_se;
+	struct dl_rq *dl_rq = &rq->dl;
+
+	if (!sched_dl_runnable(rq))
+		return NULL;
+
+	dl_se = pick_next_dl_entity(rq, dl_rq);
+	BUG_ON(!dl_se);
+	return dl_task_of(dl_se);
+}
+
 static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
        update_curr_dl(rq);
@@ -2498,6 +2511,8 @@ const struct sched_class dl_sched_class
        .check_preempt_curr	= check_preempt_curr_dl,

        .pick_next_task		= pick_next_task_dl,
+	.peek_next_task         = peek_next_task_dl,
+
        .put_prev_task		= put_prev_task_dl,
        .set_next_task		= set_next_task_dl,

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e90a69b3e85c..83949e9018a3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1636,6 +1636,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
        return p;
 }

+static struct task_struct *peek_next_task_rt(struct rq *rq)
+{
+	if (!sched_rt_runnable(rq))
+		return NULL;
+
+	return _pick_next_task_rt(rq);
+}
+
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
        update_curr_rt(rq);
@@ -2479,6 +2487,8 @@ const struct sched_class rt_sched_class
        .check_preempt_curr	= check_preempt_curr_rt,

        .pick_next_task		= pick_next_task_rt,
+	.peek_next_task         = peek_next_task_rt,
+
        .put_prev_task		= put_prev_task_rt,
        .set_next_task          = set_next_task_rt,

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2621155393c..7cd3b8682375 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1811,6 +1811,7 @@ struct sched_class {
        void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);

        struct task_struct *(*pick_next_task)(struct rq *rq);
+	struct task_struct *(*peek_next_task)(struct rq *rq);

        void (*put_prev_task)(struct rq *rq, struct task_struct *p);
        void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-07 19:22   ` Valentin Schneider
@ 2020-10-07 21:08     ` Peter Zijlstra
  2020-10-07 22:32       ` Valentin Schneider
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-07 21:08 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On Wed, Oct 07, 2020 at 08:22:44PM +0100, Valentin Schneider wrote:
> +		struct task_struct *curr = class->peek_next_task(rq);

If you look at the core-sched patches they have something very similar
:-)

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-07 21:08     ` Peter Zijlstra
@ 2020-10-07 22:32       ` Valentin Schneider
  0 siblings, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-07 22:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 07/10/20 22:08, Peter Zijlstra wrote:
> On Wed, Oct 07, 2020 at 08:22:44PM +0100, Valentin Schneider wrote:
>> +		struct task_struct *curr = class->peek_next_task(rq);
>
> If you look at the core-sched patches they have something very similar
> :-)

Shiny, there's the fair thing I chickened out of doing all prechewed!

I was trying very hard to forget about that series, seems like I did partly
succeed in that :)

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
                     ` (2 preceding siblings ...)
  2020-10-07 19:22   ` Valentin Schneider
@ 2020-10-08 10:48   ` Sebastian Andrzej Siewior
  2020-10-12  9:56   ` Dietmar Eggemann
  4 siblings, 0 replies; 58+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-08 10:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

On 2020-10-05 16:57:32 [+0200], Peter Zijlstra wrote:
> In order to minimize the interference of migrate_disable() on lower
> priority tasks, which can be deprived of runtime due to being stuck
> below a higher priority task, teach the RT/DL balancers to push away
> these higher priority tasks when a lower priority task gets selected
> to run on a freshly demoted CPU (pull).
> 
> This adds migration interference to the higher priority task, but
> restores bandwidth to the system that would otherwise be irrevocably lost.
> Without this it would be possible to have all tasks on the system
> stuck on a single CPU, each task preempted in a migrate_disable()
> section with a single high priority task running.

So there is a task running at priority 99.9 and then the scheduler decides
to interrupt it while pushing it to a new CPU? But this does happen if
the task is pinned to one CPU. Then this shouldn't do much harm.

Usually the tasks with high priority are pinned to a single CPU because
otherwise it causes noise/latency when the scheduler bounces them to
different CPUs.
Then there are the cases where the first lock limits the CPU mask and
the second lock is occupied. After that lock has been released, the task
can't acquire it because the CPU is occupied by a task with higher
priority. It would be a win if the high-prio task moved to another CPU
where possible.

Sebastian

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-05 14:57 ` [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
  2020-10-06 12:46   ` Vincent Guittot
@ 2020-10-09 20:41   ` Dietmar Eggemann
  2020-10-12 12:52     ` Peter Zijlstra
  1 sibling, 1 reply; 58+ messages in thread
From: Dietmar Eggemann @ 2020-10-09 20:41 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

On 05/10/2020 16:57, Peter Zijlstra wrote:
> Since we now migrate tasks away before DYING, we should also move
> bandwidth unthrottle, otherwise we can gain tasks from unthrottle
> after we expect all tasks to be gone already.
> 
> Also; it looks like the RT balancers don't respect cpu_active() and
> instead rely on rq->online in part, complete this. This too requires
> we do set_rq_offline() earlier to match the cpu_active() semantics.
> (The bigger patch is to convert RT to cpu_active() entirely)
> 
> Since set_rq_online() is called from sched_cpu_activate(), place
> set_rq_offline() in sched_cpu_deactivate().

[   76.215229] WARNING: CPU: 1 PID: 1913 at kernel/irq_work.c:95
irq_work_queue_on+0x108/0x110
[   76.223589] Modules linked in:
[   76.226646] CPU: 1 PID: 1913 Comm: task0-1 Not tainted
5.9.0-rc1-00159-g231df48234cb-dirty #40
[   76.235268] Hardware name: ARM Juno development board (r0) (DT)
[   76.241194] pstate: 60000085 (nZCv daIf -PAN -UAO BTYPE=--)
[   76.246772] pc : irq_work_queue_on+0x108/0x110
[   76.251220] lr : pull_rt_task+0x58/0x68
[   76.255577] sp : ffff800013a83be0
[   76.258890] x29: ffff800013a83be0 x28: ffff000972f34600
[   76.264208] x27: ffff000972f34b90 x26: ffff8000114156c0
[   76.269524] x25: 0080000000000000 x24: ffff800011cd7000
[   76.274840] x23: ffff000972f34600 x22: ffff800010d627c8
[   76.280157] x21: 0000000000000000 x20: 0000000000000000
[   76.285473] x19: ffff00097ef701c0 x18: 0000000000000010
[   76.290788] x17: 0000000000000000 x16: 0000000000000000
[   76.296104] x15: ffff000972f34a80 x14: ffffffffffffffff
[   76.301420] x13: ffff800093a83987 x12: ffff800013a8398f
[   76.306736] x11: ffff800011ac2000 x10: ffff800011ce8690
[   76.312051] x9 : 0000000000000000 x8 : ffff800011ce9000
[   76.317367] x7 : ffff8000106e9bb8 x6 : 0000000000000144
[   76.322682] x5 : 0000000000000000 x4 : ffff800011aaa1c0
[   76.327998] x3 : 0000000000000000 x2 : 0000000000000000
[   76.333314] x1 : ffff800011ce72a0 x0 : 0000000000000002
[   76.338630] Call trace:
[   76.341076]  irq_work_queue_on+0x108/0x110
[   76.349185]  pull_rt_task+0x58/0x68
[   76.352673]  balance_rt+0x84/0x88
[   76.355990]  __schedule+0x4a4/0x670
[   76.359478]  schedule+0x70/0x108
[   76.362706]  do_nanosleep+0x8c/0x178
[   76.366283]  hrtimer_nanosleep+0xa0/0x118
[   76.370294]  common_nsleep_timens+0x68/0x98
[   76.374479]  __arm64_sys_clock_nanosleep+0xc0/0x138
[   76.379361]  el0_svc_common.constprop.0+0x6c/0x168
[   76.384155]  do_el0_svc+0x24/0x90
[   76.387471]  el0_sync_handler+0x90/0x198
[   76.391394]  el0_sync+0x158/0x180


balance_rt() checks via need_pull_rt_task() that the rq is online, but
it looks like, with RT_PUSH_IPI, pull_rt_task() -> tell_cpu_to_push()
calls irq_work_queue_on() with cpu = rto_next_cpu(rq->rd), and that CPU
can now be offline here as well.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
                     ` (3 preceding siblings ...)
  2020-10-08 10:48   ` Sebastian Andrzej Siewior
@ 2020-10-12  9:56   ` Dietmar Eggemann
  2020-10-12 11:28     ` Peter Zijlstra
  4 siblings, 1 reply; 58+ messages in thread
From: Dietmar Eggemann @ 2020-10-12  9:56 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

On 05/10/2020 16:57, Peter Zijlstra wrote:

[...]

> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
>   * running task can migrate over to a CPU that is running a task
>   * of lesser priority.
>   */
> -static int push_rt_task(struct rq *rq)
> +static int push_rt_task(struct rq *rq, bool pull)
>  {
>  	struct task_struct *next_task;
>  	struct rq *lowest_rq;
> @@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
>  		return 0;
>  
>  retry:
> +	if (is_migration_disabled(next_task)) {
> +		struct task_struct *push_task = NULL;
> +		int cpu;
> +
> +		if (!pull || rq->push_busy)
> +			return 0;

Shouldn't there be the same functionality in push_dl_task(), i.e.
returning 0 earlier for a task with migration_disabled?

[...]

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-12  9:56   ` Dietmar Eggemann
@ 2020-10-12 11:28     ` Peter Zijlstra
  2020-10-12 12:22       ` Dietmar Eggemann
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-12 11:28 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On Mon, Oct 12, 2020 at 11:56:09AM +0200, Dietmar Eggemann wrote:
> On 05/10/2020 16:57, Peter Zijlstra wrote:
> 
> [...]
> 
> > --- a/kernel/sched/rt.c
> > +++ b/kernel/sched/rt.c
> > @@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
> >   * running task can migrate over to a CPU that is running a task
> >   * of lesser priority.
> >   */
> > -static int push_rt_task(struct rq *rq)
> > +static int push_rt_task(struct rq *rq, bool pull)
> >  {
> >  	struct task_struct *next_task;
> >  	struct rq *lowest_rq;
> > @@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
> >  		return 0;
> >  
> >  retry:
> > +	if (is_migration_disabled(next_task)) {
> > +		struct task_struct *push_task = NULL;
> > +		int cpu;
> > +
> > +		if (!pull || rq->push_busy)
> > +			return 0;
> 
> Shouldn't there be the same functionality in push_dl_task(), i.e.
> returning 0 earlier for a task with migration_disabled?

No, deadline didn't implement HAVE_RT_PUSH_IPI.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 10/17] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
       [not found]   ` <CH2PR14MB41833F828B4D3BA5A7B6CE7B9A0B0@CH2PR14MB4183.namprd14.prod.outlook.com>
@ 2020-10-12 11:36     ` Peter Zijlstra
       [not found]       ` <CH2PR14MB4183BF26F02A4AD4F45CADA59A070@CH2PR14MB4183.namprd14.prod.outlook.com>
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-12 11:36 UTC (permalink / raw)
  To: Tao Zhou
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

On Fri, Oct 09, 2020 at 01:22:00AM +0800, Tao Zhou wrote:
> On Mon, Oct 05, 2020 at 04:57:27PM +0200, Peter Zijlstra wrote:
> > +/*
> > + * This function is wildly self concurrent, consider at least 3 times.
> > + */
> 
> More than that

Probably. I meant to write a coherent comment, but it got very long and
not so very coherent.

It should probably enumerate all the various cases with diagrams like
those in this email:

  https://lkml.kernel.org/r/jhj3637lzdm.mognet@arm.com

So we have:

 - set_affinity() vs migrate_disable()
 - set_affinity() + N*set_affinity() vs migrate_disable()
   (ie. N additional waiters)
 - set_affinity() vs migrate_disable() vs set_affinity()
   (ie. the case from the above email)

And possibly some others.

If you have cycles to spend on writing that comment, that'd be great,
otherwise it'll stay on the todo list for a little while longer.

> > +static int affine_move_task(struct rq *rq, struct rq_flags *rf,
> > +			    struct task_struct *p, int dest_cpu, unsigned int flags)
> > +{
> > +	struct set_affinity_pending my_pending = { }, *pending = NULL;
> > +	struct migration_arg arg = {
> > +		.task = p,
> > +		.dest_cpu = dest_cpu,
> > +	};
> > +	bool complete = false;
> > +
> > +	/* Can the task run on the task's current CPU? If so, we're done */
> > +	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
> > +		pending = p->migration_pending;
> > +		if (pending) {
> > +			p->migration_pending = NULL;
> > +			complete = true;
> > +		}
> > +		task_rq_unlock(rq, p, rf);
> > +
> > +		if (complete)
> > +			goto do_complete;
> > +
> > +		return 0;
> > +	}
> > +
> > +	if (!(flags & SCA_MIGRATE_ENABLE)) {
> > +		/* serialized by p->pi_lock */
> > +		if (!p->migration_pending) {
> > +			refcount_set(&my_pending.refs, 1);
> > +			init_completion(&my_pending.done);
> > +			p->migration_pending = &my_pending;
> > +		} else {
> > +			pending = p->migration_pending;
> 
> The above load can be omited, no ?
> 
> > +			refcount_inc(&pending->refs);

That would put ^ in trouble...

> > +		}
> > +	}
> > +	pending = p->migration_pending;

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-12 11:28     ` Peter Zijlstra
@ 2020-10-12 12:22       ` Dietmar Eggemann
  0 siblings, 0 replies; 58+ messages in thread
From: Dietmar Eggemann @ 2020-10-12 12:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On 12/10/2020 13:28, Peter Zijlstra wrote:
> On Mon, Oct 12, 2020 at 11:56:09AM +0200, Dietmar Eggemann wrote:
>> On 05/10/2020 16:57, Peter Zijlstra wrote:
>>
>> [...]
>>
>>> --- a/kernel/sched/rt.c
>>> +++ b/kernel/sched/rt.c
>>> @@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
>>>   * running task can migrate over to a CPU that is running a task
>>>   * of lesser priority.
>>>   */
>>> -static int push_rt_task(struct rq *rq)
>>> +static int push_rt_task(struct rq *rq, bool pull)
>>>  {
>>>  	struct task_struct *next_task;
>>>  	struct rq *lowest_rq;
>>> @@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
>>>  		return 0;
>>>  
>>>  retry:
>>> +	if (is_migration_disabled(next_task)) {
>>> +		struct task_struct *push_task = NULL;
>>> +		int cpu;
>>> +
>>> +		if (!pull || rq->push_busy)
>>> +			return 0;
>>
>> Shouldn't there be the same functionality in push_dl_task(), i.e.
>> returning 0 earlier for a task with migration_disabled?
> 
> No, deadline didn't implement HAVE_RT_PUSH_IPI. 

Right, so 'is_migration_disabled(next_task) && !pull' should never
happen then (for RT and DL).

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-09 20:41   ` Dietmar Eggemann
@ 2020-10-12 12:52     ` Peter Zijlstra
  2020-10-12 13:18       ` Peter Zijlstra
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-12 12:52 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On Fri, Oct 09, 2020 at 10:41:11PM +0200, Dietmar Eggemann wrote:
> On 05/10/2020 16:57, Peter Zijlstra wrote:
> > Since we now migrate tasks away before DYING, we should also move
> > bandwidth unthrottle, otherwise we can gain tasks from unthrottle
> > after we expect all tasks to be gone already.
> > 
> > Also; it looks like the RT balancers don't respect cpu_active() and
> > instead rely on rq->online in part, complete this. This too requires
> > we do set_rq_offline() earlier to match the cpu_active() semantics.
> > (The bigger patch is to convert RT to cpu_active() entirely)
> > 
> > Since set_rq_online() is called from sched_cpu_activate(), place
> > set_rq_offline() in sched_cpu_deactivate().

> [   76.215229] WARNING: CPU: 1 PID: 1913 at kernel/irq_work.c:95 irq_work_queue_on+0x108/0x110

> [   76.341076]  irq_work_queue_on+0x108/0x110
> [   76.349185]  pull_rt_task+0x58/0x68
> [   76.352673]  balance_rt+0x84/0x88

> balance_rt() checks via need_pull_rt_task() that the rq is online, but
> it looks like, with RT_PUSH_IPI, pull_rt_task() -> tell_cpu_to_push()
> calls irq_work_queue_on() with cpu = rto_next_cpu(rq->rd), and that CPU
> can now be offline here as well.

Hurmph... so if I read this right, we reach offline with overload set?

Oooh, I think I see how that happens..

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-12 12:52     ` Peter Zijlstra
@ 2020-10-12 13:18       ` Peter Zijlstra
  2020-10-12 14:14         ` Dietmar Eggemann
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-12 13:18 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On Mon, Oct 12, 2020 at 02:52:00PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 09, 2020 at 10:41:11PM +0200, Dietmar Eggemann wrote:
> > On 05/10/2020 16:57, Peter Zijlstra wrote:
> > > Since we now migrate tasks away before DYING, we should also move
> > > bandwidth unthrottle, otherwise we can gain tasks from unthrottle
> > > after we expect all tasks to be gone already.
> > > 
> > > Also; it looks like the RT balancers don't respect cpu_active() and
> > > instead rely on rq->online in part, complete this. This too requires
> > > we do set_rq_offline() earlier to match the cpu_active() semantics.
> > > (The bigger patch is to convert RT to cpu_active() entirely)
> > > 
> > > Since set_rq_online() is called from sched_cpu_activate(), place
> > > set_rq_offline() in sched_cpu_deactivate().
> 
> > [   76.215229] WARNING: CPU: 1 PID: 1913 at kernel/irq_work.c:95 irq_work_queue_on+0x108/0x110
> 
> > [   76.341076]  irq_work_queue_on+0x108/0x110
> > [   76.349185]  pull_rt_task+0x58/0x68
> > [   76.352673]  balance_rt+0x84/0x88
> 
> > balance_rt() checks via need_pull_rt_task() that the rq is online, but
> > it looks like, with RT_PUSH_IPI, pull_rt_task() -> tell_cpu_to_push()
> > calls irq_work_queue_on() with cpu = rto_next_cpu(rq->rd), and that CPU
> > can now be offline here as well.
> 
> Hurmph... so if I read this right, we reach offline with overload set?
> 
> Oooh, I think I see how that happens..

I think the below two hunks need to be reverted from the patch. Can you
confirm?

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2326,9 +2326,6 @@ static void rq_online_dl(struct rq *rq)
 /* Assumes rq->lock is held */
 static void rq_offline_dl(struct rq *rq)
 {
-	if (rq->dl.overloaded)
-		dl_clear_overload(rq);
-
 	cpudl_clear(&rq->rd->cpudl, rq->cpu);
 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2245,9 +2245,6 @@ static void rq_online_rt(struct rq *rq)
 /* Assumes rq->lock is held */
 static void rq_offline_rt(struct rq *rq)
 {
-	if (rq->rt.overloaded)
-		rt_clear_overload(rq);
-
 	__disable_runtime(rq);
 
 	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control
  2020-10-12 13:18       ` Peter Zijlstra
@ 2020-10-12 14:14         ` Dietmar Eggemann
  0 siblings, 0 replies; 58+ messages in thread
From: Dietmar Eggemann @ 2020-10-12 14:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On 12/10/2020 15:18, Peter Zijlstra wrote:
> On Mon, Oct 12, 2020 at 02:52:00PM +0200, Peter Zijlstra wrote:
>> On Fri, Oct 09, 2020 at 10:41:11PM +0200, Dietmar Eggemann wrote:
>>> On 05/10/2020 16:57, Peter Zijlstra wrote:
>>>> Since we now migrate tasks away before DYING, we should also move
>>>> bandwidth unthrottle, otherwise we can gain tasks from unthrottle
>>>> after we expect all tasks to be gone already.
>>>>
>>>> Also; it looks like the RT balancers don't respect cpu_active() and
>>>> instead rely on rq->online in part, complete this. This too requires
>>>> we do set_rq_offline() earlier to match the cpu_active() semantics.
>>>> (The bigger patch is to convert RT to cpu_active() entirely)
>>>>
>>>> Since set_rq_online() is called from sched_cpu_activate(), place
>>>> set_rq_offline() in sched_cpu_deactivate().
>>
>>> [   76.215229] WARNING: CPU: 1 PID: 1913 at kernel/irq_work.c:95 irq_work_queue_on+0x108/0x110
>>
>>> [   76.341076]  irq_work_queue_on+0x108/0x110
>>> [   76.349185]  pull_rt_task+0x58/0x68
>>> [   76.352673]  balance_rt+0x84/0x88
>>
>>> balance_rt() checks via need_pull_rt_task() that the rq is online, but
>>> it looks like, with RT_PUSH_IPI, pull_rt_task() -> tell_cpu_to_push()
>>> calls irq_work_queue_on() with cpu = rto_next_cpu(rq->rd), and that CPU
>>> can now be offline here as well.
>>
>> Hurmph... so if I read this right, we reach offline with overload set?
>>
>> Oooh, I think I see how that happens..
> 
> I think the below two hunks need to be reverted from the patch. Can you
> confirm?
> 
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2326,9 +2326,6 @@ static void rq_online_dl(struct rq *rq)
>  /* Assumes rq->lock is held */
>  static void rq_offline_dl(struct rq *rq)
>  {
> -	if (rq->dl.overloaded)
> -		dl_clear_overload(rq);
> -
>  	cpudl_clear(&rq->rd->cpudl, rq->cpu);
>  	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
>  }
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2245,9 +2245,6 @@ static void rq_online_rt(struct rq *rq)
>  /* Assumes rq->lock is held */
>  static void rq_offline_rt(struct rq *rq)
>  {
> -	if (rq->rt.overloaded)
> -		rt_clear_overload(rq);
> -
>  	__disable_runtime(rq);
>  
>  	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
> 

Yes, this seems to fix it. Tested with RT testcase for > 20min. This
issue did appear within 30 secs w/o this fix.

Looks like we now bail out of pull_rt_task() in one of the
rt_overload_count related conditions before we call the RT_PUSH_IPI
functionality (tell_cpu_to_push()).
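
Roughly the relevant shape of pull_rt_task(), heavily trimmed for
illustration (not the full function):

static void pull_rt_task(struct rq *this_rq)
{
	int rt_overload_count = rt_overloaded(this_rq);	/* rd->rto_count */

	if (likely(!rt_overload_count))
		return;			/* <-- the early bail we now take */

	smp_rmb();

	/* If we are the only overloaded CPU, do nothing */
	if (rt_overload_count == 1 &&
	    cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
		return;

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);	/* path that triggered the warning */
		return;
	}
#endif
	/* ... otherwise, the per-CPU pull loop ... */
}

With rt_clear_overload() back in rq_offline_rt(), the offlined CPU drops
out of rd->rto_mask and rto_count, so one of the checks above returns
before tell_cpu_to_push() can pick an offline CPU.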

Debug snippet w/o this fix with extra output in tell_cpu_to_push():

[  128.608880] sched: RT throttling activated
[  204.240351] CPU2 cpu=0 is offline rt_overloaded=1
[  204.245069] ------------[ cut here ]------------
[  204.249697] WARNING: CPU: 2 PID: 19 at kernel/irq_work.c:95
irq_work_queue_on+0x108/0x110

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH -v2 10/17] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
       [not found]       ` <CH2PR14MB4183BF26F02A4AD4F45CADA59A070@CH2PR14MB4183.namprd14.prod.outlook.com>
@ 2020-10-13 13:20         ` Valentin Schneider
  0 siblings, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-13 13:20 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj


On 12/10/20 19:25, Tao Zhou wrote:
> Hi Peter,
> @@ -1989,7 +1994,24 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
>         };
>         bool complete = false;
>
> -       /* Can the task run on the task's current CPU? If so, we're done */
> +       /*
> +        * Can the task run on the task's current CPU ? If so,  we're done.
> +        * One scenario can happen here. Consider task T running on CPU P0:
> +        *
> +        *   P0                     P1                    p2

That naming convention weirds me out, and I'm the one who wrote that...

How about:

  https://gitlab.arm.com/linux-arm/linux-vs/-/commit/b30f50c7b2dd1ef70598bbe1a2a90934d894b861

I also have an extra goodie:

  https://gitlab.arm.com/linux-arm/linux-vs/-/commits/mainline/review/migrate_disable_peterz_v2_extras/

The whole thing can be cloned via
git@git.gitlab.arm.com:linux-arm/linux-vs.git -b mainline/review/migrate_disable_peterz_v2_extras

(I can send those as replies to patch 0 if that helps)

^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (16 preceding siblings ...)
  2020-10-05 14:57 ` [PATCH -v2 17/17] sched: Add migrate_disable() tracepoints Peter Zijlstra
@ 2020-10-13 14:01 ` Valentin Schneider
  2020-10-13 14:15   ` Sebastian Andrzej Siewior
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
  2020-10-13 14:01 ` [PATCH 2/2] sched: Comment affine_move_task() Valentin Schneider
  18 siblings, 2 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-13 14:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: tglx, mingo, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

  migrate_disable();
  set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
  affine_move_task(); <-- never returns

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ccd1099adaa..7f4e38819de1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2189,6 +2189,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
+	if (p == current &&
+	    is_migration_disabled(p) &&
+	    !cpumask_test_cpu(task_cpu(p), new_mask))
+		ret = -EBUSY;
+
 	/*
 	 * Picking a ~random cpu helps in cases where we are changing affinity
 	 * for groups of tasks (ie. cpuset), so that load balancing is not
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [PATCH 2/2] sched: Comment affine_move_task()
  2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
                   ` (17 preceding siblings ...)
  2020-10-13 14:01 ` [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Valentin Schneider
@ 2020-10-13 14:01 ` Valentin Schneider
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
  18 siblings, 1 reply; 58+ messages in thread
From: Valentin Schneider @ 2020-10-13 14:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: tglx, mingo, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f4e38819de1..8cf74ba49d61 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2032,7 +2032,75 @@ struct set_affinity_pending {
 };
 
 /*
- * This function is wildly self concurrent, consider at least 3 times.
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *                                <blocks>
+ *     <resumes>
+ *     migrate_enable();
+ *       __set_cpus_allowed_ptr();
+ *       <wakes local stopper>
+ *                         `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0            P1                             P2
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                        set_cpus_allowed_ptr(P0, [1]);
+ *                          <blocks>
+ *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
+ *                                                         <signal completion>
+ *                          <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
  */
 static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 			    struct task_struct *p, int dest_cpu, unsigned int flags)
@@ -2075,6 +2143,7 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 	if (!(flags & SCA_MIGRATE_ENABLE)) {
 		/* serialized by p->pi_lock */
 		if (!p->migration_pending) {
+			/* Install the request */
 			refcount_set(&my_pending.refs, 1);
 			init_completion(&my_pending.done);
 			p->migration_pending = &my_pending;
@@ -2113,7 +2182,11 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 	}
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
-
+		/*
+		 * Lessen races (and headaches) by delegating
+		 * is_migration_disabled(p) checks to the stopper, which will
+		 * run on the same CPU as said p.
+		 */
 		task_rq_unlock(rq, p, rf);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 
@@ -2138,6 +2211,10 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 	if (refcount_dec_and_test(&pending->refs))
 		wake_up_var(&pending->refs);
 
+	/*
+	 * Block the original owner of &pending until all subsequent callers
+	 * have seen the completion and decremented the refcount
+	 */
 	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
 
 	return 0;
-- 
2.27.0
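
The install/reuse/refcount dance described in the new comment can also be modelled
outside the kernel. Below is a hypothetical, simplified userspace model (plain C
with pthreads); 'struct pending', 'installed' and 'pi_lock' are stand-ins for the
scheduler's data structures, not the real thing. It only illustrates why the first
caller, whose struct lives on its stack, must block until the last reference is
dropped.

/* Build with: gcc -pthread -o pending pending.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct pending {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             refs;   /* requesters still referencing this struct */
        bool            done;   /* models "migration completed" */
};

static struct pending *installed;       /* models p->migration_pending */
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER; /* models p->pi_lock */

static void *requester(void *arg)
{
        struct pending mine;            /* lives on this stack frame */
        struct pending *p;

        (void)arg;
        pthread_mutex_init(&mine.lock, NULL);
        pthread_cond_init(&mine.cond, NULL);
        mine.refs = 1;
        mine.done = false;

        pthread_mutex_lock(&pi_lock);
        if (!installed)
                installed = &mine;      /* first caller: install the request */
        p = installed;
        if (p != &mine)
                p->refs++;              /* later caller: reuse the existing one */
        pthread_mutex_unlock(&pi_lock);

        /* Wait for the completion signalled by the "stopper" side. */
        pthread_mutex_lock(&p->lock);
        while (!p->done)
                pthread_cond_wait(&p->cond, &p->lock);
        if (--p->refs == 0)
                pthread_cond_broadcast(&p->cond);       /* last ref: wake the owner */
        pthread_mutex_unlock(&p->lock);

        /* The owner's struct is on its stack: block until nobody references it. */
        if (p == &mine) {
                pthread_mutex_lock(&mine.lock);
                while (mine.refs)
                        pthread_cond_wait(&mine.cond, &mine.lock);
                pthread_mutex_unlock(&mine.lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t req[3];
        struct pending *p;
        int i;

        for (i = 0; i < 3; i++)
                pthread_create(&req[i], NULL, requester, NULL);

        /* Crude ordering for the demo: let every requester install or attach.
         * The real code must also handle requests racing with the completion. */
        sleep(1);
        do {
                pthread_mutex_lock(&pi_lock);
                p = installed;
                installed = NULL;       /* uninstall under "pi_lock" */
                pthread_mutex_unlock(&pi_lock);
        } while (!p);

        /* "Stopper" side: signal the completion once for all waiters. */
        pthread_mutex_lock(&p->lock);
        p->done = true;
        pthread_cond_broadcast(&p->cond);
        pthread_mutex_unlock(&p->lock);

        for (i = 0; i < 3; i++)
                pthread_join(req[i], NULL);
        puts("all requesters completed; the owner's stack frame stayed valid");
        return 0;
}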


^ permalink raw reply related	[flat|nested] 58+ messages in thread

* Re: [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-13 14:01 ` [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Valentin Schneider
@ 2020-10-13 14:15   ` Sebastian Andrzej Siewior
  2020-10-13 14:21     ` Peter Zijlstra
  2020-10-13 14:22     ` Valentin Schneider
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
  1 sibling, 2 replies; 58+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-13 14:15 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: linux-kernel, tglx, mingo, qais.yousef, swood, peterz,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj

On 2020-10-13 15:01:15 [+0100], Valentin Schneider wrote:
>   migrate_disable();
>   set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
>   affine_move_task(); <-- never returns
> 
> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
> ---
>  kernel/sched/core.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4ccd1099adaa..7f4e38819de1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2189,6 +2189,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
>  	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
>  		goto out;
>  
> +	if (p == current &&
> +	    is_migration_disabled(p) &&
> +	    !cpumask_test_cpu(task_cpu(p), new_mask))
> +		ret = -EBUSY;
> +

This shouldn't happen, right? The function may sleep so it shouldn't be
entered with disabled migration. A WARN_ON might spot the bad caller.

Sebastian
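
For reference, folding a WARN into the existing check would look roughly like the
snippet below; this matches what eventually lands further down the thread, before
the SCA_MIGRATE_ENABLE refinement:

        if (WARN_ON_ONCE(p == current &&
                         is_migration_disabled(p) &&
                         !cpumask_test_cpu(task_cpu(p), new_mask)))
                ret = -EBUSY;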

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-13 14:15   ` Sebastian Andrzej Siewior
@ 2020-10-13 14:21     ` Peter Zijlstra
  2020-10-15  9:14       ` Valentin Schneider
  2020-10-13 14:22     ` Valentin Schneider
  1 sibling, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2020-10-13 14:21 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Valentin Schneider, linux-kernel, tglx, mingo, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj

On Tue, Oct 13, 2020 at 04:15:08PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-10-13 15:01:15 [+0100], Valentin Schneider wrote:
> >   migrate_disable();
> >   set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
> >   affine_move_task(); <-- never returns
> > 
> > Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
> > ---
> >  kernel/sched/core.c | 5 +++++
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 4ccd1099adaa..7f4e38819de1 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2189,6 +2189,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
> >  	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
> >  		goto out;
> >  
> > +	if (p == current &&
> > +	    is_migration_disabled(p) &&
> > +	    !cpumask_test_cpu(task_cpu(p), new_mask))
> > +		ret = -EBUSY;
> > +
> 
> This shouldn't happen, right? The function may sleep so it shouldn't be
> entered with disabled migration. A WARN_ON might spot the bad caller.

So yeah, I like detecting the case but agree with bigeasy that an
additional WARN would make sense, lemme go add that.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-13 14:15   ` Sebastian Andrzej Siewior
  2020-10-13 14:21     ` Peter Zijlstra
@ 2020-10-13 14:22     ` Valentin Schneider
  1 sibling, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-13 14:22 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-kernel, tglx, mingo, qais.yousef, swood, peterz,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj


On 13/10/20 15:15, Sebastian Andrzej Siewior wrote:
> On 2020-10-13 15:01:15 [+0100], Valentin Schneider wrote:
>>   migrate_disable();
>>   set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
>>   affine_move_task(); <-- never returns
>>
>> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
>> ---
>>  kernel/sched/core.c | 5 +++++
>>  1 file changed, 5 insertions(+)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 4ccd1099adaa..7f4e38819de1 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2189,6 +2189,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
>>      if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
>>              goto out;
>>
>> +	if (p == current &&
>> +	    is_migration_disabled(p) &&
>> +	    !cpumask_test_cpu(task_cpu(p), new_mask))
>> +		ret = -EBUSY;
>> +
>
> This shouldn't happen, right? The function may sleep so it shouldn't be
> entered with disabled migration. A WARN_ON might spot the bad caller.
>

Yeah, this is one of those paranoia-driven checks.

> Sebastian

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-13 14:21     ` Peter Zijlstra
@ 2020-10-15  9:14       ` Valentin Schneider
  0 siblings, 0 replies; 58+ messages in thread
From: Valentin Schneider @ 2020-10-15  9:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Sebastian Andrzej Siewior, linux-kernel, tglx, mingo,
	qais.yousef, swood, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj


On 13/10/20 15:21, Peter Zijlstra wrote:
> On Tue, Oct 13, 2020 at 04:15:08PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2020-10-13 15:01:15 [+0100], Valentin Schneider wrote:
>> >   migrate_disable();
>> >   set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
>> >   affine_move_task(); <-- never returns
>> >
>> > Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
>> > ---
>> >  kernel/sched/core.c | 5 +++++
>> >  1 file changed, 5 insertions(+)
>> >
>> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> > index 4ccd1099adaa..7f4e38819de1 100644
>> > --- a/kernel/sched/core.c
>> > +++ b/kernel/sched/core.c
>> > @@ -2189,6 +2189,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
>> >    if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
>> >            goto out;
>> >
>> > +	if (p == current &&
>> > +	    is_migration_disabled(p) &&
>> > +	    !cpumask_test_cpu(task_cpu(p), new_mask))
>> > +		ret = -EBUSY;
>> > +
>>
>> This shouldn't happen, right? The function may sleep so it shouldn't be
>> entered with disabled migration. A WARN_ON might spot the bad caller.
>
> So yeah, I like detecting the case but agree with bigeasy that an
> additional WARN would make sense, lemme go add that.

Err, I've just realized that this will warn on migrate_enable() if there
are pending affinity changes, since p->migration_disabled is cleared
*after* the call to __set_cpus_allowed_ptr().

This wants:
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d503d6cb8350..e8156e3d3b4a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2266,7 +2266,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
                goto out;

-	if (WARN_ON_ONCE(p == current &&
+	if (WARN_ON_ONCE(!(flags & SCA_MIGRATE_ENABLE) &&
+			 p == current &&
                         is_migration_disabled(p) &&
                         !cpumask_test_cpu(task_cpu(p), new_mask)))
                ret = -EBUSY;
---
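
The ordering in question, as a rough sketch; this is illustrative pseudo-code
based on the description above, not the series' actual migrate_enable() (the
nesting handling and exact arguments are assumptions):

void migrate_enable(void)
{
        struct task_struct *p = current;

        if (p->migration_disabled > 1) {
                p->migration_disabled--;        /* still nested, nothing to do */
                return;
        }

        /*
         * Restore the full affinity first.  At this point
         * is_migration_disabled(p) is still true, so a WARN keyed only on
         * that condition would fire for this perfectly legitimate call...
         */
        __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);

        /* ...because the flag is only cleared afterwards. */
        p->migration_disabled = 0;
}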

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip: sched/core] sched: Comment affine_move_task()
  2020-10-13 14:01 ` [PATCH 2/2] sched: Comment affine_move_task() Valentin Schneider
@ 2020-11-11  8:23   ` tip-bot2 for Valentin Schneider
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Valentin Schneider, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     c777d847107e80df24dae87fc9cf4b4c0bf4dfed
Gitweb:        https://git.kernel.org/tip/c777d847107e80df24dae87fc9cf4b4c0bf4dfed
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Tue, 13 Oct 2020 15:01:16 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:02 +01:00

sched: Comment affine_move_task()

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201013140116.26651-2-valentin.schneider@arm.com
---
 kernel/sched/core.c | 81 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88c6fcb..c6409f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2076,7 +2076,75 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 }
 
 /*
- * This function is wildly self concurrent, consider at least 3 times.
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *                                <blocks>
+ *     <resumes>
+ *     migrate_enable();
+ *       __set_cpus_allowed_ptr();
+ *       <wakes local stopper>
+ *                         `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0            P1                             P2
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                        set_cpus_allowed_ptr(P0, [1]);
+ *                          <blocks>
+ *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
+ *                                                         <signal completion>
+ *                          <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
  */
 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
 			    int dest_cpu, unsigned int flags)
@@ -2120,6 +2188,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 	if (!(flags & SCA_MIGRATE_ENABLE)) {
 		/* serialized by p->pi_lock */
 		if (!p->migration_pending) {
+			/* Install the request */
 			refcount_set(&my_pending.refs, 1);
 			init_completion(&my_pending.done);
 			p->migration_pending = &my_pending;
@@ -2165,7 +2234,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 	}
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
-
+		/*
+		 * Lessen races (and headaches) by delegating
+		 * is_migration_disabled(p) checks to the stopper, which will
+		 * run on the same CPU as said p.
+		 */
 		task_rq_unlock(rq, p, rf);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 
@@ -2190,6 +2263,10 @@ do_complete:
 	if (refcount_dec_and_test(&pending->refs))
 		wake_up_var(&pending->refs);
 
+	/*
+	 * Block the original owner of &pending until all subsequent callers
+	 * have seen the completion and decremented the refcount
+	 */
 	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
 
 	return 0;

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip: sched/core] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-13 14:01 ` [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Valentin Schneider
  2020-10-13 14:15   ` Sebastian Andrzej Siewior
@ 2020-11-11  8:23   ` tip-bot2 for Valentin Schneider
  1 sibling, 0 replies; 58+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Valentin Schneider, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     885b3ba47aa5cc16550beb8a42181ad5e8302ceb
Gitweb:        https://git.kernel.org/tip/885b3ba47aa5cc16550beb8a42181ad5e8302ceb
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Tue, 13 Oct 2020 15:01:15 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:02 +01:00

sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()

  migrate_disable();
  set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
  affine_move_task(); <-- never returns

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201013140116.26651-1-valentin.schneider@arm.com
---
 kernel/sched/core.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e92d785..88c6fcb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2238,8 +2238,17 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		goto out;
 	}
 
-	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
-		goto out;
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		if (cpumask_equal(&p->cpus_mask, new_mask))
+			goto out;
+
+		if (WARN_ON_ONCE(p == current &&
+				 is_migration_disabled(p) &&
+				 !cpumask_test_cpu(task_cpu(p), new_mask))) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
 
 	/*
 	 * Picking a ~random cpu helps in cases where we are changing affinity

^ permalink raw reply related	[flat|nested] 58+ messages in thread

end of thread, other threads:[~2020-11-11  8:25 UTC | newest]

Thread overview: 58+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-05 14:57 [PATCH -v2 00/17] sched: Migrate disable support Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 01/17] stop_machine: Add function and caller debug info Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 02/17] sched: Fix balance_callback() Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 03/17] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
2020-10-06  7:25   ` Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 04/17] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 05/17] workqueue: Manually break affinity " Peter Zijlstra
2020-10-05 16:31   ` Tejun Heo
2020-10-05 14:57 ` [PATCH -v2 06/17] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 07/17] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
2020-10-06 12:46   ` Vincent Guittot
2020-10-06 13:33     ` Peter Zijlstra
2020-10-09 20:41   ` Dietmar Eggemann
2020-10-12 12:52     ` Peter Zijlstra
2020-10-12 13:18       ` Peter Zijlstra
2020-10-12 14:14         ` Dietmar Eggemann
2020-10-05 14:57 ` [PATCH -v2 08/17] sched: Massage set_cpus_allowed() Peter Zijlstra
2020-10-06 11:20   ` Valentin Schneider
2020-10-05 14:57 ` [PATCH -v2 09/17] sched: Add migrate_disable() Peter Zijlstra
2020-10-06 11:20   ` Valentin Schneider
2020-10-05 14:57 ` [PATCH -v2 10/17] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
     [not found]   ` <CH2PR14MB41833F828B4D3BA5A7B6CE7B9A0B0@CH2PR14MB4183.namprd14.prod.outlook.com>
2020-10-12 11:36     ` Peter Zijlstra
     [not found]       ` <CH2PR14MB4183BF26F02A4AD4F45CADA59A070@CH2PR14MB4183.namprd14.prod.outlook.com>
2020-10-13 13:20         ` Valentin Schneider
2020-10-05 14:57 ` [PATCH -v2 11/17] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 12/17] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
2020-10-06 14:09   ` Juri Lelli
2020-10-06 14:20     ` Peter Zijlstra
2020-10-06 15:55   ` Qais Yousef
2020-10-07  7:22     ` Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 13/17] sched,rt: Use the full cpumask for balancing Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 14/17] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 15/17] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
2020-10-06  7:59   ` Peter Zijlstra
2020-10-06 13:44     ` Steven Rostedt
2020-10-06 13:50       ` Peter Zijlstra
2020-10-06 11:20   ` Valentin Schneider
2020-10-06 13:48     ` Peter Zijlstra
2020-10-06 14:37       ` Juri Lelli
2020-10-06 14:48         ` Peter Zijlstra
2020-10-07  5:29           ` Juri Lelli
2020-10-06 16:19       ` Valentin Schneider
2020-10-07 19:22   ` Valentin Schneider
2020-10-07 21:08     ` Peter Zijlstra
2020-10-07 22:32       ` Valentin Schneider
2020-10-08 10:48   ` Sebastian Andrzej Siewior
2020-10-12  9:56   ` Dietmar Eggemann
2020-10-12 11:28     ` Peter Zijlstra
2020-10-12 12:22       ` Dietmar Eggemann
2020-10-05 14:57 ` [PATCH -v2 16/17] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
2020-10-05 14:57 ` [PATCH -v2 17/17] sched: Add migrate_disable() tracepoints Peter Zijlstra
2020-10-13 14:01 ` [PATCH 1/2] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Valentin Schneider
2020-10-13 14:15   ` Sebastian Andrzej Siewior
2020-10-13 14:21     ` Peter Zijlstra
2020-10-15  9:14       ` Valentin Schneider
2020-10-13 14:22     ` Valentin Schneider
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
2020-10-13 14:01 ` [PATCH 2/2] sched: Comment affine_move_task() Valentin Schneider
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Valentin Schneider

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).